diff --git a/.gitignore b/.gitignore index 0fe114d251..8a5270973e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,65 +1,65 @@ -config-devices.* -config-all-devices.* -config-all-disas.* -config-host.* -config-target.* -trace/generated-tracers.h -trace/generated-tracers.c -trace/generated-tracers-dtrace.h -trace/generated-tracers.dtrace -trace/generated-events.h -trace/generated-events.c -libcacard/trace/generated-tracers.c +/config-devices.* +/config-all-devices.* +/config-all-disas.* +/config-host.* +/config-target.* +/config.status +/trace/generated-tracers.h +/trace/generated-tracers.c +/trace/generated-tracers-dtrace.h +/trace/generated-tracers.dtrace +/trace/generated-events.h +/trace/generated-events.c +/trace/generated-ust-provider.h +/trace/generated-ust.c +/libcacard/trace/generated-tracers.c *-timestamp -*-softmmu -*-darwin-user -*-linux-user -*-bsd-user -libdis* -libuser -linux-headers/asm -qapi-generated -qapi-types.[ch] -qapi-visit.[ch] -qmp-commands.h -qmp-marshal.c -qemu-doc.html -qemu-tech.html -qemu-doc.info -qemu-tech.info -qemu.1 -qemu.pod -qemu-img.1 -qemu-img.pod -qemu-img -qemu-nbd -qemu-nbd.8 -qemu-nbd.pod -qemu-options.def -qemu-options.texi -qemu-img-cmds.texi -qemu-img-cmds.h -qemu-io -qemu-ga -qemu-bridge-helper -qemu-monitor.texi -vscclient -QMP/qmp-commands.txt -test-coroutine -test-qmp-input-visitor -test-qmp-output-visitor -test-string-input-visitor -test-string-output-visitor -test-visitor-serialization -fsdev/virtfs-proxy-helper -fsdev/virtfs-proxy-helper.1 -fsdev/virtfs-proxy-helper.pod -.gdbinit +/*-softmmu +/*-darwin-user +/*-linux-user +/*-bsd-user +/libdis* +/libuser +/linux-headers/asm +/qga/qapi-generated +/qapi-generated +/qapi-types.[ch] +/qapi-visit.[ch] +/qmp-commands.h +/qmp-marshal.c +/qemu-doc.html +/qemu-tech.html +/qemu-doc.info +/qemu-tech.info +/qemu.1 +/qemu.pod +/qemu-img.1 +/qemu-img.pod +/qemu-img +/qemu-nbd +/qemu-nbd.8 +/qemu-nbd.pod +/qemu-options.def +/qemu-options.texi +/qemu-img-cmds.texi +/qemu-img-cmds.h +/qemu-io +/qemu-ga +/qemu-bridge-helper +/qemu-monitor.texi +/qmp-commands.txt +/vscclient +/fsdev/virtfs-proxy-helper +/fsdev/virtfs-proxy-helper.1 +/fsdev/virtfs-proxy-helper.pod *.a *.aux *.cp *.dvi *.exe +*.dll +*.so +*.mo *.fn *.ky *.log @@ -73,34 +73,31 @@ fsdev/virtfs-proxy-helper.pod *.tp *.vr *.d -!scripts/qemu-guest-agent/fsfreeze-hook.d +!/scripts/qemu-guest-agent/fsfreeze-hook.d *.o *.lo *.la *.pc .libs -*.swp -*.orig -.pc +.sdk *.gcda *.gcno -patches -pc-bios/bios-pq/status -pc-bios/vgabios-pq/status -pc-bios/optionrom/linuxboot.asm -pc-bios/optionrom/linuxboot.bin -pc-bios/optionrom/linuxboot.raw -pc-bios/optionrom/linuxboot.img -pc-bios/optionrom/multiboot.asm -pc-bios/optionrom/multiboot.bin -pc-bios/optionrom/multiboot.raw -pc-bios/optionrom/multiboot.img -pc-bios/optionrom/kvmvapic.asm -pc-bios/optionrom/kvmvapic.bin -pc-bios/optionrom/kvmvapic.raw -pc-bios/optionrom/kvmvapic.img -pc-bios/s390-ccw/s390-ccw.elf -pc-bios/s390-ccw/s390-ccw.img +/pc-bios/bios-pq/status +/pc-bios/vgabios-pq/status +/pc-bios/optionrom/linuxboot.asm +/pc-bios/optionrom/linuxboot.bin +/pc-bios/optionrom/linuxboot.raw +/pc-bios/optionrom/linuxboot.img +/pc-bios/optionrom/multiboot.asm +/pc-bios/optionrom/multiboot.bin +/pc-bios/optionrom/multiboot.raw +/pc-bios/optionrom/multiboot.img +/pc-bios/optionrom/kvmvapic.asm +/pc-bios/optionrom/kvmvapic.bin +/pc-bios/optionrom/kvmvapic.raw +/pc-bios/optionrom/kvmvapic.img +/pc-bios/s390-ccw/s390-ccw.elf +/pc-bios/s390-ccw/s390-ccw.img .stgit-* cscope.* tags diff --git a/.gitmodules b/.gitmodules index d7e3f3c7cd..444c24a993 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,27 +1,30 @@ [submodule "roms/vgabios"] path = roms/vgabios - url = git://git.qemu.org/vgabios.git/ + url = git://git.qemu-project.org/vgabios.git/ [submodule "roms/seabios"] path = roms/seabios - url = git://git.qemu.org/seabios.git/ + url = git://git.qemu-project.org/seabios.git/ [submodule "roms/SLOF"] path = roms/SLOF - url = git://git.qemu.org/SLOF.git + url = git://git.qemu-project.org/SLOF.git [submodule "roms/ipxe"] path = roms/ipxe - url = git://git.qemu.org/ipxe.git + url = git://git.qemu-project.org/ipxe.git [submodule "roms/openbios"] path = roms/openbios - url = git://git.qemu.org/openbios.git + url = git://git.qemu-project.org/openbios.git +[submodule "roms/openhackware"] + path = roms/openhackware + url = git://git.qemu-project.org/openhackware.git [submodule "roms/qemu-palcode"] path = roms/qemu-palcode url = git://github.com/rth7680/qemu-palcode.git [submodule "roms/sgabios"] path = roms/sgabios - url = git://git.qemu.org/sgabios.git + url = git://git.qemu-project.org/sgabios.git [submodule "pixman"] path = pixman url = git://anongit.freedesktop.org/pixman [submodule "dtc"] path = dtc - url = git://git.qemu.org/dtc.git + url = git://git.qemu-project.org/dtc.git diff --git a/.mailmap b/.mailmap index 9797802aaa..28defa1b2c 100644 --- a/.mailmap +++ b/.mailmap @@ -2,7 +2,8 @@ # into proper addresses so that they are counted properly in git shortlog output. # Andrzej Zaborowski balrog -Anthony Liguori aliguori +Anthony Liguori aliguori +Anthony Liguori Anthony Liguori Aurelien Jarno aurel32 Blue Swirl blueswir1 Edgar E. Iglesias edgar_igl diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000000..04da973632 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,81 @@ +language: c +python: + - "2.4" +compiler: + - gcc + - clang +notifications: + irc: + channels: + - "irc.oftc.net#qemu" + on_success: change + on_failure: always +env: + global: + - TEST_CMD="make check" + - EXTRA_CONFIG="" + # Development packages, EXTRA_PKGS saved for additional builds + - CORE_PKGS="libusb-1.0-0-dev libiscsi-dev librados-dev libncurses5-dev" + - NET_PKGS="libseccomp-dev libgnutls-dev libssh2-1-dev libspice-server-dev libspice-protocol-dev libnss3-dev" + - GUI_PKGS="libgtk-3-dev libvte-2.90-dev libsdl1.2-dev libpng12-dev libpixman-1-dev" + - EXTRA_PKGS="" + matrix: + - TARGETS=alpha-softmmu,alpha-linux-user + - TARGETS=arm-softmmu,arm-linux-user + - TARGETS=aarch64-softmmu,aarch64-linux-user + - TARGETS=cris-softmmu + - TARGETS=i386-softmmu,x86_64-softmmu + - TARGETS=lm32-softmmu + - TARGETS=m68k-softmmu + - TARGETS=microblaze-softmmu,microblazeel-softmmu + - TARGETS=mips-softmmu,mips64-softmmu,mips64el-softmmu,mipsel-softmmu + - TARGETS=moxie-softmmu + - TARGETS=or32-softmmu, + - TARGETS=ppc-softmmu,ppc64-softmmu,ppcemb-softmmu + - TARGETS=s390x-softmmu + - TARGETS=sh4-softmmu,sh4eb-softmmu + - TARGETS=sparc-softmmu,sparc64-softmmu + - TARGETS=unicore32-softmmu + - TARGETS=xtensa-softmmu,xtensaeb-softmmu +before_install: + - git submodule update --init --recursive + - sudo apt-get update -qq + - sudo apt-get install -qq ${CORE_PKGS} ${NET_PKGS} ${GUI_PKGS} ${EXTRA_PKGS} +script: "./configure --target-list=${TARGETS} ${EXTRA_CONFIG} && make && ${TEST_CMD}" +matrix: + # We manually include a number of additional build for non-standard bits + include: + # Debug related options + - env: TARGETS=i386-softmmu,x86_64-softmmu + EXTRA_CONFIG="--enable-debug" + compiler: gcc + - env: TARGETS=i386-softmmu,x86_64-softmmu + EXTRA_CONFIG="--enable-debug --enable-tcg-interpreter" + compiler: gcc + # All the extra -dev packages + - env: TARGETS=i386-softmmu,x86_64-softmmu + EXTRA_PKGS="libaio-dev libcap-ng-dev libattr1-dev libbrlapi-dev uuid-dev libusb-1.0.0-dev" + compiler: gcc + # Currently configure doesn't force --disable-pie + - env: TARGETS=i386-softmmu,x86_64-softmmu + EXTRA_CONFIG="--enable-gprof --enable-gcov --disable-pie" + compiler: gcc + - env: TARGETS=i386-softmmu,x86_64-softmmu + EXTRA_PKGS="sparse" + EXTRA_CONFIG="--enable-sparse" + compiler: gcc + # All the trace backends (apart from dtrace) + - env: TARGETS=i386-softmmu,x86_64-softmmu + EXTRA_CONFIG="--enable-trace-backend=stderr" + compiler: gcc + - env: TARGETS=i386-softmmu,x86_64-softmmu + EXTRA_CONFIG="--enable-trace-backend=simple" + compiler: gcc + - env: TARGETS=i386-softmmu,x86_64-softmmu + EXTRA_CONFIG="--enable-trace-backend=ftrace" + TEST_CMD="" + compiler: gcc + - env: TARGETS=i386-softmmu,x86_64-softmmu + EXTRA_PKGS="liblttng-ust-dev liburcu-dev" + EXTRA_CONFIG="--enable-trace-backend=ust" + compiler: gcc diff --git a/CODING_STYLE b/CODING_STYLE index dcbce28a27..4280945ff0 100644 --- a/CODING_STYLE +++ b/CODING_STYLE @@ -84,3 +84,10 @@ and clarity it comes on a line by itself: Rationale: a consistent (except for functions...) bracing style reduces ambiguity and avoids needless churn when lines are added or removed. Furthermore, it is the QEMU coding style. + +5. Declarations + +Mixed declarations (interleaving statements and declarations within blocks) +are not allowed; declarations should be at the beginning of blocks. In other +words, the code should not generate warnings if using GCC's +-Wdeclaration-after-statement option. diff --git a/Changelog b/Changelog index 13eebefb74..1249b8aac5 100644 --- a/Changelog +++ b/Changelog @@ -1,6 +1,6 @@ This file documents changes for QEMU releases 0.12 and earlier. For changelog information for later releases, see -http://wiki.qemu.org/ChangeLog or look at the git history for +http://wiki.qemu-project.org/ChangeLog or look at the git history for more detailed information. diff --git a/MAINTAINERS b/MAINTAINERS index 654e2cb410..97c9fa1f7f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -50,8 +50,14 @@ Descriptions of section entries: General Project Administration ------------------------------ -M: Anthony Liguori -M: Paul Brook +M: Anthony Liguori + +Responsible Disclosure, Reporting Security Issues +------------------------------ +W: http://wiki.qemu.org/SecurityProcess +M: Michael S. Tsirkin +M: Anthony Liguori +L: secalert@redhat.com Guest CPU cores (TCG): ---------------------- @@ -62,7 +68,6 @@ F: target-alpha/ F: hw/alpha/ ARM -M: Paul Brook M: Peter Maydell S: Maintained F: target-arm/ @@ -83,8 +88,7 @@ F: hw/lm32/ F: hw/char/lm32_* M68K -M: Paul Brook -S: Odd Fixes +S: Orphan F: target-m68k/ F: hw/m68k/ @@ -161,7 +165,6 @@ Guest CPU Cores (KVM): ---------------------- Overall -M: Gleb Natapov M: Paolo Bonzini L: kvm@vger.kernel.org S: Supported @@ -179,12 +182,14 @@ S: Maintained F: target-ppc/kvm.c S390 +M: Christian Borntraeger +M: Cornelia Huck M: Alexander Graf S: Maintained F: target-s390x/kvm.c +F: hw/intc/s390_flic.[hc] X86 -M: Gleb Natapov M: Marcelo Tosatti L: kvm@vger.kernel.org S: Supported @@ -222,6 +227,13 @@ F: *win32* ARM Machines ------------ +Allwinner-a10 +M: Li Guang +S: Maintained +F: hw/*/allwinner-a10* +F: include/hw/*/allwinner-a10* +F: hw/arm/cubieboard.c + Exynos M: Evgeny Voevodin M: Maksim Kozlov @@ -236,6 +248,12 @@ S: Supported F: hw/arm/highbank.c F: hw/net/xgmac.c +Canon DIGIC +M: Antony Pavlov +S: Maintained +F: include/hw/arm/digic.h +F: hw/*/digic* + Gumstix M: qemu-devel@nongnu.org S: Orphan @@ -248,7 +266,6 @@ F: hw/*/imx* F: hw/arm/kzm.c Integrator CP -M: Paul Brook M: Peter Maydell S: Maintained F: hw/arm/integratorcp.c @@ -274,7 +291,6 @@ S: Maintained F: hw/arm/palm.c Real View -M: Paul Brook M: Peter Maydell S: Maintained F: hw/arm/realview* @@ -285,19 +301,17 @@ S: Maintained F: hw/arm/spitz.c Stellaris -M: Paul Brook M: Peter Maydell S: Maintained F: hw/*/stellaris* Versatile PB -M: Paul Brook M: Peter Maydell S: Maintained F: hw/*/versatile* Xilinx Zynq -M: Peter Crosthwaite +M: Peter Crosthwaite S: Maintained F: hw/arm/xilinx_zynq.c F: hw/misc/zynq_slcr.c @@ -327,18 +341,15 @@ F: hw/lm32/milkymist.c M68K Machines ------------- an5206 -M: Paul Brook -S: Maintained +S: Orphan F: hw/m68k/an5206.c dummy_m68k -M: Paul Brook -S: Maintained +S: Orphan F: hw/m68k/dummy_m68k.c mcf5208 -M: Paul Brook -S: Maintained +S: Orphan F: hw/m68k/mcf5208.c MicroBlaze Machines @@ -349,7 +360,7 @@ S: Maintained F: hw/microblaze/petalogix_s3adsp1800_mmu.c petalogix_ml605 -M: Peter Crosthwaite +M: Peter Crosthwaite S: Maintained F: hw/microblaze/petalogix_ml605_mmu.c @@ -492,10 +503,13 @@ F: hw/s390x/s390-*.c S390 Virtio-ccw M: Cornelia Huck +M: Christian Borntraeger M: Alexander Graf S: Supported F: hw/s390x/s390-virtio-ccw.c F: hw/s390x/css.[hc] +F: hw/s390x/sclp*.[hc] +F: hw/s390x/ipl*.[hc] T: git git://github.com/cohuck/qemu virtio-ccw-upstr UniCore32 Machines @@ -509,10 +523,24 @@ F: hw/unicore32/ X86 Machines ------------ PC -M: Anthony Liguori +M: Anthony Liguori +M: Michael S. Tsirkin S: Supported -F: hw/i386/pc.[ch] -F: hw/i386/pc_piix.c +F: include/hw/i386/ +F: hw/i386/ +F: hw/pci-host/piix.c +F: hw/pci-host/q35.c +F: hw/pci-host/pam.c +F: include/hw/pci-host/q35.h +F: include/hw/pci-host/pam.h +F: hw/isa/piix4.c +F: hw/isa/lpc_ich9.c +F: hw/i2c/smbus_ich9.c +F: hw/acpi/piix4.c +F: hw/acpi/ich9.c +F: include/hw/acpi/ich9.h +F: include/hw/acpi/piix.h + Xtensa Machines --------------- @@ -567,12 +595,11 @@ F: hw/scsi/* T: git git://github.com/bonzini/qemu.git scsi-next LSI53C895A -M: Paul Brook -S: Odd Fixes +S: Orphan F: hw/scsi/lsi53c895a.c SSI -M: Peter Crosthwaite +M: Peter Crosthwaite S: Maintained F: hw/ssi/* F: hw/block/m25p80.c @@ -581,6 +608,7 @@ USB M: Gerd Hoffmann S: Maintained F: hw/usb/* +F: tests/usb-hcd-ehci-test.c VFIO M: Alex Williamson @@ -593,7 +621,8 @@ S: Supported F: hw/*/*vhost* virtio -M: Anthony Liguori +M: Anthony Liguori +M: Michael S. Tsirkin S: Supported F: hw/*/virtio* @@ -602,6 +631,7 @@ M: Aneesh Kumar K.V S: Supported F: hw/9pfs/ F: fsdev/ +F: tests/virtio-9p-test.c T: git git://github.com/kvaneesh/QEMU.git virtio-blk @@ -612,6 +642,7 @@ F: hw/block/virtio-blk.c virtio-ccw M: Cornelia Huck +M: Christian Borntraeger S: Supported F: hw/s390x/virtio-ccw.[hc] T: git git://github.com/cohuck/qemu virtio-ccw-upstr @@ -626,9 +657,10 @@ nvme M: Keith Busch S: Supported F: hw/block/nvme* +F: tests/nvme-test.c Xilinx EDK -M: Peter Crosthwaite +M: Peter Crosthwaite M: Edgar E. Iglesias S: Maintained F: hw/*/xilinx_* @@ -638,9 +670,13 @@ Subsystems ---------- Audio M: Vassili Karpov (malc) +M: Gerd Hoffmann S: Maintained F: audio/ F: hw/audio/ +F: tests/ac97-test.c +F: tests/es1370-test.c +F: tests/intel-hda-test.c Block M: Kevin Wolf @@ -649,9 +685,13 @@ S: Supported F: block* F: block/ F: hw/block/ +F: qemu-img* +F: qemu-io* +T: git git://repo.or.cz/qemu/kevin.git block +T: git git://github.com/stefanha/qemu.git block Character Devices -M: Anthony Liguori +M: Anthony Liguori S: Maintained F: qemu-char.c @@ -669,7 +709,7 @@ F: include/hw/cpu/icc_bus.h F: hw/cpu/icc_bus.c Device Tree -M: Peter Crosthwaite +M: Peter Crosthwaite M: Alexander Graf S: Maintained F: device_tree.[ch] @@ -689,34 +729,45 @@ F: audio/spiceaudio.c F: hw/display/qxl* Graphics -M: Anthony Liguori -S: Maintained +M: Anthony Liguori +M: Gerd Hoffmann +S: Odd Fixes F: ui/ Cocoa graphics M: Andreas Färber +M: Peter Maydell S: Odd Fixes F: ui/cocoa.m Main loop -M: Anthony Liguori +M: Anthony Liguori S: Supported F: vl.c Human Monitor (HMP) M: Luiz Capitulino -S: Supported +S: Maintained F: monitor.c F: hmp.c F: hmp-commands.hx +T: git git://repo.or.cz/qemu/qmp-unstable.git queue/qmp Network device layer -M: Anthony Liguori +M: Anthony Liguori M: Stefan Hajnoczi S: Maintained F: net/ T: git git://github.com/stefanha/qemu.git net +Netmap network backend +M: Luigi Rizzo +M: Giuseppe Lettieri +M: Vincenzo Maffione +W: http://info.iet.unipi.it/~luigi/netmap/ +S: Maintained +F: net/netmap.c + Network Block Device (NBD) M: Paolo Bonzini S: Odd Fixes @@ -728,8 +779,9 @@ T: git git://github.com/bonzini/qemu.git nbd-next QAPI M: Luiz Capitulino M: Michael Roth -S: Supported +S: Maintained F: qapi/ +T: git git://repo.or.cz/qemu/qmp-unstable.git queue/qmp QAPI Schema M: Eric Blake @@ -737,14 +789,27 @@ M: Luiz Capitulino M: Markus Armbruster S: Supported F: qapi-schema.json +T: git git://repo.or.cz/qemu/qmp-unstable.git queue/qmp + +QOM +M: Anthony Liguori +M: Andreas Färber +S: Supported +T: git git://github.com/afaerber/qemu-cpu.git qom-next +F: include/qom/ +X: include/qom/cpu.h +F: qom/ +X: qom/cpu.c +F: tests/qom-test.c QMP M: Luiz Capitulino -S: Supported +S: Maintained F: qmp.c F: monitor.c F: qmp-commands.hx F: QMP/ +T: git git://repo.or.cz/qemu/qmp-unstable.git queue/qmp SLIRP M: Jan Kiszka @@ -766,6 +831,12 @@ M: Blue Swirl S: Odd Fixes F: scripts/checkpatch.pl +Seccomp +M: Eduardo Otubo +S: Supported +F: qemu-seccomp.c +F: include/sysemu/seccomp.h + Usermode Emulation ------------------ BSD user @@ -797,11 +868,6 @@ M: Andrzej Zaborowski S: Maintained F: tcg/arm/ -HPPA target -M: Richard Henderson -S: Maintained -F: tcg/hppa/ - i386 target M: qemu-devel@nongnu.org S: Maintained @@ -842,25 +908,73 @@ TCI target M: Stefan Weil S: Maintained F: tcg/tci/ +F: tci.c Stable branches --------------- Stable 1.0 L: qemu-stable@nongnu.org -T: git git://git.qemu.org/qemu-stable-1.0.git +T: git git://git.qemu-project.org/qemu-stable-1.0.git S: Orphan Stable 0.15 L: qemu-stable@nongnu.org -T: git git://git.qemu.org/qemu-stable-0.15.git -S: Orphan +M: Andreas Färber +T: git git://git.qemu-project.org/qemu-stable-0.15.git +S: Supported Stable 0.14 L: qemu-stable@nongnu.org -T: git git://git.qemu.org/qemu-stable-0.14.git +T: git git://git.qemu-project.org/qemu-stable-0.14.git S: Orphan Stable 0.10 L: qemu-stable@nongnu.org -T: git git://git.qemu.org/qemu-stable-0.10.git +T: git git://git.qemu-project.org/qemu-stable-0.10.git S: Orphan + +Block drivers +------------- +VMDK +M: Fam Zheng +S: Supported +F: block/vmdk.c + +RBD +M: Josh Durgin +S: Supported +F: block/rbd.c + +Sheepdog +M: MORITA Kazutaka +M: Liu Yuan +L: sheepdog@lists.wpkg.org +S: Supported +F: block/sheepdog.c + +VHDX +M: Jeff Cody +S: Supported +F: block/vhdx* + +VDI +M: Stefan Weil +S: Maintained +F: block/vdi.c + +iSCSI +M: Ronnie Sahlberg +M: Paolo Bonzini +M: Peter Lieven +S: Supported +F: block/iscsi.c + +NFS +M: Peter Lieven +S: Maintained +F: block/nfs.c + +SSH +M: Richard W.M. Jones +S: Supported +F: block/ssh.c diff --git a/Makefile b/Makefile index 4d257f1a52..d830483685 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,14 @@ CONFIG_ALL=y include $(SRC_PATH)/rules.mak config-host.mak: $(SRC_PATH)/configure @echo $@ is out-of-date, running configure - @sed -n "/.*Configured with/s/[^:]*: //p" $@ | sh + @# TODO: The next lines include code which supports a smooth + @# transition from old configurations without config.status. + @# This code can be removed after QEMU 1.7. + @if test -x config.status; then \ + ./config.status; \ + else \ + sed -n "/.*Configured with/s/[^:]*: //p" $@ | sh; \ + fi else config-host.mak: ifneq ($(filter-out %clean,$(MAKECMDGOALS)),$(if $(MAKECMDGOALS),,fail)) @@ -50,6 +57,11 @@ GENERATED_HEADERS += trace/generated-tracers-dtrace.h endif GENERATED_SOURCES += trace/generated-tracers.c +ifeq ($(TRACE_BACKEND),ust) +GENERATED_HEADERS += trace/generated-ust-provider.h +GENERATED_SOURCES += trace/generated-ust.c +endif + # Don't try to regenerate Makefile or configure # We don't generate any of them Makefile: ; @@ -65,7 +77,7 @@ LIBS+=-lz $(LIBS_TOOLS) HELPERS-$(CONFIG_LINUX) = qemu-bridge-helper$(EXESUF) ifdef BUILD_DOCS -DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8 QMP/qmp-commands.txt +DOCS=qemu-doc.html qemu-tech.html qemu.1 qemu-img.1 qemu-nbd.8 qmp-commands.txt ifdef CONFIG_VIRTFS DOCS+=fsdev/virtfs-proxy-helper.1 endif @@ -115,13 +127,26 @@ defconfig: ifneq ($(wildcard config-host.mak),) include $(SRC_PATH)/Makefile.objs +endif + +dummy := $(call unnest-vars,, \ + stub-obj-y \ + util-obj-y \ + qga-obj-y \ + qga-vss-dll-obj-y \ + block-obj-y \ + block-obj-m \ + common-obj-y \ + common-obj-m) + +ifneq ($(wildcard config-host.mak),) include $(SRC_PATH)/tests/Makefile endif ifeq ($(CONFIG_SMARTCARD_NSS),y) include $(SRC_PATH)/libcacard/Makefile endif -all: $(DOCS) $(TOOLS) $(HELPERS-y) recurse-all +all: $(DOCS) $(TOOLS) $(HELPERS-y) recurse-all modules config-host.h: config-host.h-timestamp config-host.h-timestamp: config-host.mak @@ -131,6 +156,7 @@ qemu-options.def: $(SRC_PATH)/qemu-options.hx SUBDIR_RULES=$(patsubst %,subdir-%, $(TARGET_DIRS)) SOFTMMU_SUBDIR_RULES=$(filter %-softmmu,$(SUBDIR_RULES)) +$(SOFTMMU_SUBDIR_RULES): $(block-obj-y) $(SOFTMMU_SUBDIR_RULES): config-all-devices.mak subdir-%: @@ -165,10 +191,10 @@ ALL_SUBDIRS=$(TARGET_DIRS) $(patsubst %,pc-bios/%, $(ROMS)) recurse-all: $(SUBDIR_RULES) $(ROMSUBDIR_RULES) -bt-host.o: QEMU_CFLAGS += $(BLUEZ_CFLAGS) - $(BUILD_DIR)/version.o: $(SRC_PATH)/version.rc $(BUILD_DIR)/config-host.h | $(BUILD_DIR)/version.lo + $(call quiet-command,$(WINDRES) -I$(BUILD_DIR) -o $@ $<," RC version.o") $(BUILD_DIR)/version.lo: $(SRC_PATH)/version.rc $(BUILD_DIR)/config-host.h + $(call quiet-command,$(WINDRES) -I$(BUILD_DIR) -o $@ $<," RC version.lo") Makefile: $(version-obj-y) $(version-lobj-y) @@ -178,6 +204,9 @@ Makefile: $(version-obj-y) $(version-lobj-y) libqemustub.a: $(stub-obj-y) libqemuutil.a: $(util-obj-y) qapi-types.o qapi-visit.o +block-modules = $(foreach o,$(block-obj-m),"$(basename $(subst /,-,$o))",) NULL +util/module.o-cflags = -D'CONFIG_BLOCK_MODULES=$(block-modules)' + ###################################################################### qemu-img.o: qemu-img-cmds.h @@ -203,23 +232,35 @@ qapi-py = $(SRC_PATH)/scripts/qapi.py $(SRC_PATH)/scripts/ordereddict.py qga/qapi-generated/qga-qapi-types.c qga/qapi-generated/qga-qapi-types.h :\ $(SRC_PATH)/qga/qapi-schema.json $(SRC_PATH)/scripts/qapi-types.py $(qapi-py) - $(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-types.py $(gen-out-type) -o qga/qapi-generated -p "qga-" < $<, " GEN $@") + $(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-types.py \ + $(gen-out-type) -o qga/qapi-generated -p "qga-" -i $<, \ + " GEN $@") qga/qapi-generated/qga-qapi-visit.c qga/qapi-generated/qga-qapi-visit.h :\ $(SRC_PATH)/qga/qapi-schema.json $(SRC_PATH)/scripts/qapi-visit.py $(qapi-py) - $(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-visit.py $(gen-out-type) -o qga/qapi-generated -p "qga-" < $<, " GEN $@") + $(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-visit.py \ + $(gen-out-type) -o qga/qapi-generated -p "qga-" -i $<, \ + " GEN $@") qga/qapi-generated/qga-qmp-commands.h qga/qapi-generated/qga-qmp-marshal.c :\ $(SRC_PATH)/qga/qapi-schema.json $(SRC_PATH)/scripts/qapi-commands.py $(qapi-py) - $(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-commands.py $(gen-out-type) -o qga/qapi-generated -p "qga-" < $<, " GEN $@") + $(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-commands.py \ + $(gen-out-type) -o qga/qapi-generated -p "qga-" -i $<, \ + " GEN $@") qapi-types.c qapi-types.h :\ $(SRC_PATH)/qapi-schema.json $(SRC_PATH)/scripts/qapi-types.py $(qapi-py) - $(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-types.py $(gen-out-type) -o "." -b < $<, " GEN $@") + $(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-types.py \ + $(gen-out-type) -o "." -b -i $<, \ + " GEN $@") qapi-visit.c qapi-visit.h :\ $(SRC_PATH)/qapi-schema.json $(SRC_PATH)/scripts/qapi-visit.py $(qapi-py) - $(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-visit.py $(gen-out-type) -o "." -b < $<, " GEN $@") + $(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-visit.py \ + $(gen-out-type) -o "." -b -i $<, \ + " GEN $@") qmp-commands.h qmp-marshal.c :\ $(SRC_PATH)/qapi-schema.json $(SRC_PATH)/scripts/qapi-commands.py $(qapi-py) - $(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-commands.py $(gen-out-type) -m -o "." < $<, " GEN $@") + $(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-commands.py \ + $(gen-out-type) -o "." -m -i $<, \ + " GEN $@") QGALIB_GEN=$(addprefix qga/qapi-generated/, qga-qapi-types.h qga-qapi-visit.h qga-qmp-commands.h) $(qga-obj-y) qemu-ga.o: $(QGALIB_GEN) @@ -231,10 +272,10 @@ clean: # avoid old build problems by removing potentially incorrect old files rm -f config.mak op-i386.h opc-i386.h gen-op-i386.h op-arm.h opc-arm.h gen-op-arm.h rm -f qemu-options.def - find . -name '*.[oda]' -type f -exec rm -f {} + - find . -name '*.l[oa]' -type f -exec rm -f {} + - rm -f $(TOOLS) $(HELPERS-y) qemu-ga TAGS cscope.* *.pod *~ */*~ - rm -Rf .libs + find . \( -name '*.l[oa]' -o -name '*.so' -o -name '*.dll' -o -name '*.mo' -o -name '*.[oda]' \) -type f -exec rm {} + + rm -f $(filter-out %.tlb,$(TOOLS)) $(HELPERS-y) qemu-ga TAGS cscope.* *.pod *~ */*~ + rm -f fsdev/*.pod + rm -rf .libs */.libs rm -f qemu-img-cmds.h @# May not be present in GENERATED_HEADERS rm -f trace/generated-tracers-dtrace.dtrace* @@ -243,7 +284,6 @@ clean: rm -f $(foreach f,$(GENERATED_SOURCES),$(f) $(f)-timestamp) rm -rf qapi-generated rm -rf qga/qapi-generated - $(MAKE) -C tests/tcg clean for d in $(ALL_SUBDIRS); do \ if test -d $$d; then $(MAKE) -C $$d $@ || exit 1; fi; \ rm -f $$d/qemu-options.def; \ @@ -259,6 +299,7 @@ qemu-%.tar.bz2: distclean: clean rm -f config-host.mak config-host.h* config-host.ld $(DOCS) qemu-options.texi qemu-img-cmds.texi qemu-monitor.texi rm -f config-all-devices.mak config-all-disas.mak + rm -f po/*.mo rm -f roms/seabios/config.mak roms/vgabios/config.mak rm -f qemu-doc.info qemu-doc.aux qemu-doc.cp qemu-doc.cps qemu-doc.dvi rm -f qemu-doc.fn qemu-doc.fns qemu-doc.info qemu-doc.ky qemu-doc.kys @@ -270,19 +311,20 @@ distclean: clean for d in $(TARGET_DIRS); do \ rm -rf $$d || exit 1 ; \ done + rm -Rf .sdk if test -f pixman/config.log; then make -C pixman distclean; fi if test -f dtc/version_gen.h; then make $(DTC_MAKE_ARGS) clean; fi KEYMAPS=da en-gb et fr fr-ch is lt modifiers no pt-br sv \ ar de en-us fi fr-be hr it lv nl pl ru th \ common de-ch es fo fr-ca hu ja mk nl-be pt sl tr \ -bepo +bepo cz ifdef INSTALL_BLOBS -BLOBS=bios.bin sgabios.bin vgabios.bin vgabios-cirrus.bin \ +BLOBS=bios.bin bios-256k.bin sgabios.bin vgabios.bin vgabios-cirrus.bin \ vgabios-stdvga.bin vgabios-vmware.bin vgabios-qxl.bin \ acpi-dsdt.aml q35-acpi-dsdt.aml \ -ppc_rom.bin openbios-sparc32 openbios-sparc64 openbios-ppc \ +ppc_rom.bin openbios-sparc32 openbios-sparc64 openbios-ppc QEMU,tcx.bin QEMU,cgthree.bin \ pxe-e1000.rom pxe-eepro100.rom pxe-ne2k_pci.rom \ pxe-pcnet.rom pxe-rtl8139.rom pxe-virtio.rom \ efi-e1000.rom efi-eepro100.rom efi-ne2k_pci.rom \ @@ -301,7 +343,7 @@ endif install-doc: $(DOCS) $(INSTALL_DIR) "$(DESTDIR)$(qemu_docdir)" $(INSTALL_DATA) qemu-doc.html qemu-tech.html "$(DESTDIR)$(qemu_docdir)" - $(INSTALL_DATA) QMP/qmp-commands.txt "$(DESTDIR)$(qemu_docdir)" + $(INSTALL_DATA) qmp-commands.txt "$(DESTDIR)$(qemu_docdir)" ifdef CONFIG_POSIX $(INSTALL_DIR) "$(DESTDIR)$(mandir)/man1" $(INSTALL_DATA) qemu.1 "$(DESTDIR)$(mandir)/man1" @@ -336,11 +378,25 @@ install: all $(if $(BUILD_DOCS),install-doc) install-sysconfig \ install-datadir install-localstatedir $(INSTALL_DIR) "$(DESTDIR)$(bindir)" ifneq ($(TOOLS),) - $(INSTALL_PROG) $(STRIP_OPT) $(TOOLS) "$(DESTDIR)$(bindir)" + $(INSTALL_PROG) $(TOOLS) "$(DESTDIR)$(bindir)" +ifneq ($(STRIP),) + $(STRIP) $(TOOLS:%="$(DESTDIR)$(bindir)/%") +endif +endif +ifneq ($(CONFIG_MODULES),) + $(INSTALL_DIR) "$(DESTDIR)$(qemu_moddir)" + for s in $(modules-m:.mo=$(DSOSUF)); do \ + t="$(DESTDIR)$(qemu_moddir)/$$(echo $$s | tr / -)"; \ + $(INSTALL_LIB) $$s "$$t"; \ + test -z "$(STRIP)" || $(STRIP) "$$t"; \ + done endif ifneq ($(HELPERS-y),) $(INSTALL_DIR) "$(DESTDIR)$(libexecdir)" - $(INSTALL_PROG) $(STRIP_OPT) $(HELPERS-y) "$(DESTDIR)$(libexecdir)" + $(INSTALL_PROG) $(HELPERS-y) "$(DESTDIR)$(libexecdir)" +ifneq ($(STRIP),) + $(STRIP) $(HELPERS-y:%="$(DESTDIR)$(libexecdir)/%") +endif endif ifneq ($(BLOBS),) set -e; for x in $(BLOBS); do \ @@ -355,7 +411,7 @@ endif $(INSTALL_DATA) $(SRC_PATH)/pc-bios/keymaps/$$x "$(DESTDIR)$(qemu_datadir)/keymaps"; \ done for d in $(TARGET_DIRS); do \ - $(MAKE) -C $$d $@ || exit 1 ; \ + $(MAKE) $(SUBDIR_MAKEFLAGS) TARGET_DIR=$$d/ -C $$d $@ || exit 1 ; \ done # various test targets @@ -395,7 +451,7 @@ qemu-options.texi: $(SRC_PATH)/qemu-options.hx qemu-monitor.texi: $(SRC_PATH)/hmp-commands.hx $(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -t < $< > $@," GEN $@") -QMP/qmp-commands.txt: $(SRC_PATH)/qmp-commands.hx +qmp-commands.txt: $(SRC_PATH)/qmp-commands.hx $(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -q < $< > $@," GEN $@") qemu-img-cmds.texi: $(SRC_PATH)/qemu-img-cmds.hx diff --git a/Makefile.objs b/Makefile.objs index f46a4cdd6a..b897e1dc34 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -19,11 +19,8 @@ block-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o block-obj-y += qemu-coroutine-sleep.o block-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o -ifeq ($(CONFIG_VIRTIO)$(CONFIG_VIRTFS)$(CONFIG_PCI),yyy) -# Lots of the fsdev/9pcode is pulled in by vl.c via qemu_fsdev_add. -# only pull in the actual virtio-9p device if we also enabled virtio. -CONFIG_REALLY_VIRTFS=y -endif +block-obj-m = block/ + ###################################################################### # smartcard @@ -34,6 +31,8 @@ libcacard-y += libcacard/vcard_emul_nss.o libcacard-y += libcacard/vcard_emul_type.o libcacard-y += libcacard/card_7816.o libcacard-y += libcacard/vcardt.o +libcacard/vcard_emul_nss.o-cflags := $(NSS_CFLAGS) +libcacard/vcard_emul_nss.o-libs := $(NSS_LIBS) ###################################################################### # Target independent part of system emulation. The long term path is to @@ -41,9 +40,9 @@ libcacard-y += libcacard/vcardt.o # single QEMU executable should support all CPUs and machines. ifeq ($(CONFIG_SOFTMMU),y) -common-obj-y = $(block-obj-y) blockdev.o blockdev-nbd.o block/ +common-obj-y = blockdev.o blockdev-nbd.o block/ +common-obj-y += iothread.o common-obj-y += net/ -common-obj-y += readline.o common-obj-y += qdev-monitor.o device-hotplug.o common-obj-$(CONFIG_WIN32) += os-win32.o common-obj-$(CONFIG_POSIX) += os-posix.o @@ -51,6 +50,8 @@ common-obj-$(CONFIG_POSIX) += os-posix.o common-obj-$(CONFIG_LINUX) += fsdev/ common-obj-y += migration.o migration-tcp.o +common-obj-y += vmstate.o +common-obj-y += qemu-file.o common-obj-$(CONFIG_RDMA) += migration-rdma.o common-obj-y += qemu-char.o #aio.o common-obj-y += block-migration.o @@ -65,9 +66,11 @@ common-obj-y += hw/ common-obj-y += ui/ common-obj-y += bt-host.o bt-vhci.o +bt-host.o-cflags := $(BLUEZ_CFLAGS) common-obj-y += dma-helpers.o common-obj-y += vl.o +vl.o-cflags := $(GPROF_CFLAGS) $(SDL_CFLAGS) common-obj-y += tpm.o common-obj-$(CONFIG_SLIRP) += slirp/ @@ -109,17 +112,4 @@ version-lobj-$(CONFIG_WIN32) += $(BUILD_DIR)/version.lo # FIXME: a few definitions from qapi-types.o/qapi-visit.o are needed # by libqemuutil.a. These should be moved to a separate .json schema. qga-obj-y = qga/ qapi-types.o qapi-visit.o - -vl.o: QEMU_CFLAGS+=$(GPROF_CFLAGS) - -vl.o: QEMU_CFLAGS+=$(SDL_CFLAGS) - -QEMU_CFLAGS+=$(GLIB_CFLAGS) - -nested-vars += \ - stub-obj-y \ - util-obj-y \ - qga-obj-y \ - block-obj-y \ - common-obj-y -dummy := $(call unnest-vars) +qga-vss-dll-obj-y = qga/ diff --git a/Makefile.target b/Makefile.target index 9a4985213b..dd815bb1f2 100644 --- a/Makefile.target +++ b/Makefile.target @@ -16,19 +16,22 @@ QEMU_CFLAGS+=-I$(SRC_PATH)/include ifdef CONFIG_USER_ONLY # user emulator name QEMU_PROG=qemu-$(TARGET_NAME) +QEMU_PROG_BUILD = $(QEMU_PROG) else # system emulator name +QEMU_PROG=qemu-system-$(TARGET_NAME)$(EXESUF) ifneq (,$(findstring -mwindows,$(libs_softmmu))) # Terminate program name with a 'w' because the linker builds a windows executable. QEMU_PROGW=qemu-system-$(TARGET_NAME)w$(EXESUF) -endif # windows executable -QEMU_PROG=qemu-system-$(TARGET_NAME)$(EXESUF) +$(QEMU_PROG): $(QEMU_PROGW) + $(call quiet-command,$(OBJCOPY) --subsystem console $(QEMU_PROGW) $(QEMU_PROG)," GEN $(TARGET_DIR)$(QEMU_PROG)") +QEMU_PROG_BUILD = $(QEMU_PROGW) +else +QEMU_PROG_BUILD = $(QEMU_PROG) endif - -PROGS=$(QEMU_PROG) -ifdef QEMU_PROGW -PROGS+=$(QEMU_PROGW) endif + +PROGS=$(QEMU_PROG) $(QEMU_PROGW) STPFILES= config-target.h: config-target.h-timestamp @@ -70,10 +73,6 @@ all: $(PROGS) stap # Dummy command so that make thinks it has done something @true -CONFIG_NO_PCI = $(if $(subst n,,$(CONFIG_PCI)),n,y) -CONFIG_NO_KVM = $(if $(subst n,,$(CONFIG_KVM)),n,y) -CONFIG_NO_XEN = $(if $(subst n,,$(CONFIG_XEN)),n,y) - ######################################################### # cpu emulator library obj-y = exec.o translate-all.o cpu-exec.o @@ -83,8 +82,8 @@ obj-$(CONFIG_TCG_INTERPRETER) += disas/tci.o obj-y += fpu/softfloat.o obj-y += target-$(TARGET_BASE_ARCH)/ obj-y += disas.o -obj-$(CONFIG_GDBSTUB_XML) += gdbstub-xml.o -obj-$(CONFIG_NO_KVM) += kvm-stub.o +obj-$(call notempty,$(TARGET_XML_FILES)) += gdbstub-xml.o +obj-$(call lnot,$(CONFIG_KVM)) += kvm-stub.o ######################################################### # Linux user emulator target @@ -113,7 +112,7 @@ endif #CONFIG_BSD_USER ######################################################### # System emulator target ifdef CONFIG_SOFTMMU -obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o +obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o numa.o obj-y += qtest.o obj-y += hw/ obj-$(CONFIG_FDT) += device_tree.o @@ -124,8 +123,10 @@ obj-y += dump.o LIBS+=$(libs_softmmu) # xen support -obj-$(CONFIG_XEN) += xen-all.o xen-mapcache.o -obj-$(CONFIG_NO_XEN) += xen-stub.o +obj-$(CONFIG_XEN) += xen-common.o +obj-$(CONFIG_XEN_I386) += xen-hvm.o xen-mapcache.o +obj-$(call lnot,$(CONFIG_XEN)) += xen-common-stub.o +obj-$(call lnot,$(CONFIG_XEN_I386)) += xen-hvm-stub.o # Hardware support ifeq ($(TARGET_NAME), sparc64) @@ -134,8 +135,6 @@ else obj-y += hw/$(TARGET_BASE_ARCH)/ endif -main.o: QEMU_CFLAGS+=$(GPROF_CFLAGS) - GENERATED_HEADERS += hmp-commands.h qmp-commands-old.h endif # CONFIG_SOFTMMU @@ -143,28 +142,27 @@ endif # CONFIG_SOFTMMU # Workaround for http://gcc.gnu.org/PR55489, see configure. %/translate.o: QEMU_CFLAGS += $(TRANSLATE_OPT_CFLAGS) -nested-vars += obj-y +dummy := $(call unnest-vars,,obj-y) +all-obj-y := $(obj-y) -# This resolves all nested paths, so it must come last +block-obj-y := +common-obj-y := include $(SRC_PATH)/Makefile.objs - -all-obj-y = $(obj-y) -all-obj-y += $(addprefix ../, $(common-obj-y)) +dummy := $(call unnest-vars,.., \ + block-obj-y \ + block-obj-m \ + common-obj-y \ + common-obj-m) +all-obj-y += $(common-obj-y) +all-obj-$(CONFIG_SOFTMMU) += $(block-obj-y) ifndef CONFIG_HAIKU LIBS+=-lm endif -ifdef QEMU_PROGW -# The linker builds a windows executable. Make also a console executable. -$(QEMU_PROGW): $(all-obj-y) ../libqemuutil.a ../libqemustub.a +# build either PROG or PROGW +$(QEMU_PROG_BUILD): $(all-obj-y) ../libqemuutil.a ../libqemustub.a $(call LINK,$^) -$(QEMU_PROG): $(QEMU_PROGW) - $(call quiet-command,$(OBJCOPY) --subsystem console $(QEMU_PROGW) $(QEMU_PROG)," GEN $(TARGET_DIR)$(QEMU_PROG)") -else -$(QEMU_PROG): $(all-obj-y) ../libqemuutil.a ../libqemustub.a - $(call LINK,$^) -endif gdbstub-xml.c: $(TARGET_XML_FILES) $(SRC_PATH)/scripts/feature_to_c.sh $(call quiet-command,rm -f $@ && $(SHELL) $(SRC_PATH)/scripts/feature_to_c.sh $@ $(TARGET_XML_FILES)," GEN $(TARGET_DIR)$@") @@ -185,9 +183,9 @@ endif install: all ifneq ($(PROGS),) - $(INSTALL) -m 755 $(PROGS) "$(DESTDIR)$(bindir)" + $(INSTALL_PROG) $(PROGS) "$(DESTDIR)$(bindir)" ifneq ($(STRIP),) - $(STRIP) $(patsubst %,"$(DESTDIR)$(bindir)/%",$(PROGS)) + $(STRIP) $(PROGS:%="$(DESTDIR)$(bindir)/%") endif endif ifdef CONFIG_TRACE_SYSTEMTAP diff --git a/QMP/README b/QMP/README deleted file mode 100644 index c95a08c234..0000000000 --- a/QMP/README +++ /dev/null @@ -1,88 +0,0 @@ - QEMU Monitor Protocol - ===================== - -Introduction -------------- - -The QEMU Monitor Protocol (QMP) allows applications to communicate with -QEMU's Monitor. - -QMP is JSON[1] based and currently has the following features: - -- Lightweight, text-based, easy to parse data format -- Asynchronous messages support (ie. events) -- Capabilities Negotiation - -For detailed information on QMP's usage, please, refer to the following files: - -o qmp-spec.txt QEMU Monitor Protocol current specification -o qmp-commands.txt QMP supported commands (auto-generated at build-time) -o qmp-events.txt List of available asynchronous events - -There is also a simple Python script called 'qmp-shell' available. - -IMPORTANT: It's strongly recommended to read the 'Stability Considerations' -section in the qmp-commands.txt file before making any serious use of QMP. - - -[1] http://www.json.org - -Usage ------ - -To enable QMP, you need a QEMU monitor instance in "control mode". There are -two ways of doing this. - -The simplest one is using the '-qmp' command-line option. The following -example makes QMP available on localhost port 4444: - - $ qemu [...] -qmp tcp:localhost:4444,server - -However, in order to have more complex combinations, like multiple monitors, -the '-mon' command-line option should be used along with the '-chardev' one. -For instance, the following example creates one user monitor on stdio and one -QMP monitor on localhost port 4444. - - $ qemu [...] -chardev stdio,id=mon0 -mon chardev=mon0,mode=readline \ - -chardev socket,id=mon1,host=localhost,port=4444,server \ - -mon chardev=mon1,mode=control - -Please, refer to QEMU's manpage for more information. - -Simple Testing --------------- - -To manually test QMP one can connect with telnet and issue commands by hand: - -$ telnet localhost 4444 -Trying 127.0.0.1... -Connected to localhost. -Escape character is '^]'. -{"QMP": {"version": {"qemu": {"micro": 50, "minor": 13, "major": 0}, "package": ""}, "capabilities": []}} -{ "execute": "qmp_capabilities" } -{"return": {}} -{ "execute": "query-version" } -{"return": {"qemu": {"micro": 50, "minor": 13, "major": 0}, "package": ""}} - -Development Process -------------------- - -When changing QMP's interface (by adding new commands, events or modifying -existing ones) it's mandatory to update the relevant documentation, which is -one (or more) of the files listed in the 'Introduction' section*. - -Also, it's strongly recommended to send the documentation patch first, before -doing any code change. This is so because: - - 1. Avoids the code dictating the interface - - 2. Review can improve your interface. Letting that happen before - you implement it can save you work. - -* The qmp-commands.txt file is generated from the qmp-commands.hx one, which - is the file that should be edited. - -Homepage --------- - -http://wiki.qemu.org/QMP diff --git a/README b/README index c77d12642d..c7c990d895 100644 --- a/README +++ b/README @@ -1,3 +1,3 @@ -Read the documentation in qemu-doc.html or on http://wiki.qemu.org +Read the documentation in qemu-doc.html or on http://wiki.qemu-project.org - QEMU team diff --git a/VERSION b/VERSION index daeb9f5f40..52be4db49f 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.5.93 +2.0.50 diff --git a/aio-posix.c b/aio-posix.c index b68eccd40c..f921d4f538 100644 --- a/aio-posix.c +++ b/aio-posix.c @@ -23,7 +23,6 @@ struct AioHandler GPollFD pfd; IOHandler *io_read; IOHandler *io_write; - AioFlushHandler *io_flush; int deleted; int pollfds_idx; void *opaque; @@ -47,7 +46,6 @@ void aio_set_fd_handler(AioContext *ctx, int fd, IOHandler *io_read, IOHandler *io_write, - AioFlushHandler *io_flush, void *opaque) { AioHandler *node; @@ -84,7 +82,6 @@ void aio_set_fd_handler(AioContext *ctx, /* Update handler with latest information */ node->io_read = io_read; node->io_write = io_write; - node->io_flush = io_flush; node->opaque = opaque; node->pollfds_idx = -1; @@ -97,12 +94,10 @@ void aio_set_fd_handler(AioContext *ctx, void aio_set_event_notifier(AioContext *ctx, EventNotifier *notifier, - EventNotifierHandler *io_read, - AioFlushEventNotifierHandler *io_flush) + EventNotifierHandler *io_read) { aio_set_fd_handler(ctx, event_notifier_get_fd(notifier), - (IOHandler *)io_read, NULL, - (AioFlushHandler *)io_flush, notifier); + (IOHandler *)io_read, NULL, notifier); } bool aio_pending(AioContext *ctx) @@ -147,7 +142,11 @@ static bool aio_dispatch(AioContext *ctx) (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) && node->io_read) { node->io_read(node->opaque); - progress = true; + + /* aio_notify() does not count as progress */ + if (node->opaque != &ctx->notifier) { + progress = true; + } } if (!node->deleted && (revents & (G_IO_OUT | G_IO_ERR)) && @@ -166,6 +165,10 @@ static bool aio_dispatch(AioContext *ctx) g_free(tmp); } } + + /* Run our timers */ + progress |= timerlistgroup_run_timers(&ctx->tlg); + return progress; } @@ -173,7 +176,7 @@ bool aio_poll(AioContext *ctx, bool blocking) { AioHandler *node; int ret; - bool busy, progress; + bool progress; progress = false; @@ -200,20 +203,8 @@ bool aio_poll(AioContext *ctx, bool blocking) g_array_set_size(ctx->pollfds, 0); /* fill pollfds */ - busy = false; QLIST_FOREACH(node, &ctx->aio_handlers, node) { node->pollfds_idx = -1; - - /* If there aren't pending AIO operations, don't invoke callbacks. - * Otherwise, if there are no AIO requests, qemu_aio_wait() would - * wait indefinitely. - */ - if (!node->deleted && node->io_flush) { - if (node->io_flush(node->opaque) == 0) { - continue; - } - busy = true; - } if (!node->deleted && node->pfd.events) { GPollFD pfd = { .fd = node->pfd.fd, @@ -226,15 +217,10 @@ bool aio_poll(AioContext *ctx, bool blocking) ctx->walking_handlers--; - /* No AIO operations? Get us out of here */ - if (!busy) { - return progress; - } - /* wait until next event */ - ret = g_poll((GPollFD *)ctx->pollfds->data, - ctx->pollfds->len, - blocking ? -1 : 0); + ret = qemu_poll_ns((GPollFD *)ctx->pollfds->data, + ctx->pollfds->len, + blocking ? timerlistgroup_deadline_ns(&ctx->tlg) : 0); /* if we have any readable fds, dispatch event */ if (ret > 0) { @@ -245,11 +231,12 @@ bool aio_poll(AioContext *ctx, bool blocking) node->pfd.revents = pfd->revents; } } - if (aio_dispatch(ctx)) { - progress = true; - } } - assert(progress || busy); - return true; + /* Run dispatch even if there were no readable fds to run timers */ + if (aio_dispatch(ctx)) { + progress = true; + } + + return progress; } diff --git a/aio-win32.c b/aio-win32.c index 38723bf1d3..23f4e5ba19 100644 --- a/aio-win32.c +++ b/aio-win32.c @@ -23,7 +23,6 @@ struct AioHandler { EventNotifier *e; EventNotifierHandler *io_notify; - AioFlushEventNotifierHandler *io_flush; GPollFD pfd; int deleted; QLIST_ENTRY(AioHandler) node; @@ -31,8 +30,7 @@ struct AioHandler { void aio_set_event_notifier(AioContext *ctx, EventNotifier *e, - EventNotifierHandler *io_notify, - AioFlushEventNotifierHandler *io_flush) + EventNotifierHandler *io_notify) { AioHandler *node; @@ -73,7 +71,6 @@ void aio_set_event_notifier(AioContext *ctx, } /* Update handler with latest information */ node->io_notify = io_notify; - node->io_flush = io_flush; } aio_notify(ctx); @@ -96,8 +93,9 @@ bool aio_poll(AioContext *ctx, bool blocking) { AioHandler *node; HANDLE events[MAXIMUM_WAIT_OBJECTS + 1]; - bool busy, progress; + bool progress; int count; + int timeout; progress = false; @@ -111,6 +109,9 @@ bool aio_poll(AioContext *ctx, bool blocking) progress = true; } + /* Run timers */ + progress |= timerlistgroup_run_timers(&ctx->tlg); + /* * Then dispatch any pending callbacks from the GSource. * @@ -126,7 +127,11 @@ bool aio_poll(AioContext *ctx, bool blocking) if (node->pfd.revents && node->io_notify) { node->pfd.revents = 0; node->io_notify(node->e); - progress = true; + + /* aio_notify() does not count as progress */ + if (node->e != &ctx->notifier) { + progress = true; + } } tmp = node; @@ -147,19 +152,8 @@ bool aio_poll(AioContext *ctx, bool blocking) ctx->walking_handlers++; /* fill fd sets */ - busy = false; count = 0; QLIST_FOREACH(node, &ctx->aio_handlers, node) { - /* If there aren't pending AIO operations, don't invoke callbacks. - * Otherwise, if there are no AIO requests, qemu_aio_wait() would - * wait indefinitely. - */ - if (!node->deleted && node->io_flush) { - if (node->io_flush(node->e) == 0) { - continue; - } - busy = true; - } if (!node->deleted && node->io_notify) { events[count++] = event_notifier_get_handle(node->e); } @@ -167,15 +161,13 @@ bool aio_poll(AioContext *ctx, bool blocking) ctx->walking_handlers--; - /* No AIO operations? Get us out of here */ - if (!busy) { - return progress; - } - /* wait until next event */ while (count > 0) { - int timeout = blocking ? INFINITE : 0; - int ret = WaitForMultipleObjects(count, events, FALSE, timeout); + int ret; + + timeout = blocking ? + qemu_timeout_ns_to_ms(timerlistgroup_deadline_ns(&ctx->tlg)) : 0; + ret = WaitForMultipleObjects(count, events, FALSE, timeout); /* if we have any signaled events, dispatch event */ if ((DWORD) (ret - WAIT_OBJECT_0) >= count) { @@ -196,7 +188,11 @@ bool aio_poll(AioContext *ctx, bool blocking) event_notifier_get_handle(node->e) == events[ret - WAIT_OBJECT_0] && node->io_notify) { node->io_notify(node->e); - progress = true; + + /* aio_notify() does not count as progress */ + if (node->e != &ctx->notifier) { + progress = true; + } } tmp = node; @@ -214,6 +210,14 @@ bool aio_poll(AioContext *ctx, bool blocking) events[ret - WAIT_OBJECT_0] = events[--count]; } - assert(progress || busy); - return true; + if (blocking) { + /* Run the timers a second time. We do this because otherwise aio_wait + * will not note progress - and will stop a drain early - if we have + * a timer that was not ready to run entering g_poll but is ready + * after g_poll. This will only do anything if a timer has expired. + */ + progress |= timerlistgroup_run_timers(&ctx->tlg); + } + + return progress; } diff --git a/arch_init.c b/arch_init.c index 68a7ab784f..685ba0e268 100644 --- a/arch_init.c +++ b/arch_init.c @@ -45,10 +45,13 @@ #include "hw/audio/pcspk.h" #include "migration/page_cache.h" #include "qemu/config-file.h" +#include "qemu/error-report.h" #include "qmp-commands.h" #include "trace.h" #include "exec/cpu-all.h" +#include "exec/ram_addr.h" #include "hw/acpi/acpi.h" +#include "qemu/host-utils.h" #ifdef DEBUG_ARCH_INIT #define DPRINTF(fmt, ...) \ @@ -108,6 +111,8 @@ static bool mig_throttle_on; static int dirty_rate_high_cnt; static void check_guest_throttling(void); +static uint64_t bitmap_sync_count; + /***********************************************************/ /* ram save/restore */ @@ -120,7 +125,6 @@ static void check_guest_throttling(void); #define RAM_SAVE_FLAG_XBZRLE 0x40 /* 0x80 is reserved in migration.h start with 0x100 next */ - static struct defconfig_file { const char *filename; /* Indicates it is an user config file (disabled by -no-user-config) */ @@ -131,6 +135,7 @@ static struct defconfig_file { { NULL }, /* end of list */ }; +static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE]; int qemu_read_default_config_files(bool userconfig) { @@ -150,10 +155,9 @@ int qemu_read_default_config_files(bool userconfig) return 0; } -static inline bool is_zero_page(uint8_t *p) +static inline bool is_zero_range(uint8_t *p, uint64_t size) { - return buffer_find_nonzero_offset(p, TARGET_PAGE_SIZE) == - TARGET_PAGE_SIZE; + return buffer_find_nonzero_offset(p, size) == size; } /* struct contains XBZRLE cache and a static page @@ -163,25 +167,64 @@ static struct { uint8_t *encoded_buf; /* buffer for storing page content */ uint8_t *current_buf; - /* buffer used for XBZRLE decoding */ - uint8_t *decoded_buf; - /* Cache for XBZRLE */ + /* Cache for XBZRLE, Protected by lock. */ PageCache *cache; -} XBZRLE = { - .encoded_buf = NULL, - .current_buf = NULL, - .decoded_buf = NULL, - .cache = NULL, -}; + QemuMutex lock; +} XBZRLE; +/* buffer used for XBZRLE decoding */ +static uint8_t *xbzrle_decoded_buf; + +static void XBZRLE_cache_lock(void) +{ + if (migrate_use_xbzrle()) + qemu_mutex_lock(&XBZRLE.lock); +} +static void XBZRLE_cache_unlock(void) +{ + if (migrate_use_xbzrle()) + qemu_mutex_unlock(&XBZRLE.lock); +} + +/* + * called from qmp_migrate_set_cache_size in main thread, possibly while + * a migration is in progress. + * A running migration maybe using the cache and might finish during this + * call, hence changes to the cache are protected by XBZRLE.lock(). + */ int64_t xbzrle_cache_resize(int64_t new_size) { + PageCache *new_cache; + int64_t ret; + + if (new_size < TARGET_PAGE_SIZE) { + return -1; + } + + XBZRLE_cache_lock(); + if (XBZRLE.cache != NULL) { - return cache_resize(XBZRLE.cache, new_size / TARGET_PAGE_SIZE) * - TARGET_PAGE_SIZE; + if (pow2floor(new_size) == migrate_xbzrle_cache_size()) { + goto out_new_size; + } + new_cache = cache_init(new_size / TARGET_PAGE_SIZE, + TARGET_PAGE_SIZE); + if (!new_cache) { + error_report("Error creating cache"); + ret = -1; + goto out; + } + + cache_fini(XBZRLE.cache); + XBZRLE.cache = new_cache; } - return pow2floor(new_size); + +out_new_size: + ret = pow2floor(new_size); +out: + XBZRLE_cache_unlock(); + return ret; } /* accounting for migration statistics */ @@ -193,6 +236,7 @@ typedef struct AccountingInfo { uint64_t xbzrle_bytes; uint64_t xbzrle_pages; uint64_t xbzrle_cache_miss; + double xbzrle_cache_miss_rate; uint64_t xbzrle_overflows; } AccountingInfo; @@ -248,6 +292,11 @@ uint64_t xbzrle_mig_pages_cache_miss(void) return acct_info.xbzrle_cache_miss; } +double xbzrle_mig_cache_miss_rate(void) +{ + return acct_info.xbzrle_cache_miss_rate; +} + uint64_t xbzrle_mig_pages_overflow(void) { return acct_info.xbzrle_overflows; @@ -270,9 +319,37 @@ static size_t save_block_hdr(QEMUFile *f, RAMBlock *block, ram_addr_t offset, return size; } +/* This is the last block that we have visited serching for dirty pages + */ +static RAMBlock *last_seen_block; +/* This is the last block from where we have sent data */ +static RAMBlock *last_sent_block; +static ram_addr_t last_offset; +static unsigned long *migration_bitmap; +static uint64_t migration_dirty_pages; +static uint32_t last_version; +static bool ram_bulk_stage; + +/* Update the xbzrle cache to reflect a page that's been sent as all 0. + * The important thing is that a stale (not-yet-0'd) page be replaced + * by the new data. + * As a bonus, if the page wasn't in the cache it gets added so that + * when a small write is made into the 0'd page it gets XBZRLE sent + */ +static void xbzrle_cache_zero_page(ram_addr_t current_addr) +{ + if (ram_bulk_stage || !migrate_use_xbzrle()) { + return; + } + + /* We don't care if this fails to allocate a new cache page + * as long as it updated an old one */ + cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE); +} + #define ENCODING_FLAG_XBZRLE 0x1 -static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data, +static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data, ram_addr_t current_addr, RAMBlock *block, ram_addr_t offset, int cont, bool last_stage) { @@ -280,17 +357,23 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data, uint8_t *prev_cached_page; if (!cache_is_cached(XBZRLE.cache, current_addr)) { + acct_info.xbzrle_cache_miss++; if (!last_stage) { - cache_insert(XBZRLE.cache, current_addr, current_data); + if (cache_insert(XBZRLE.cache, current_addr, *current_data) == -1) { + return -1; + } else { + /* update *current_data when the page has been + inserted into cache */ + *current_data = get_cached_data(XBZRLE.cache, current_addr); + } } - acct_info.xbzrle_cache_miss++; return -1; } prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); /* save current buffer into memory */ - memcpy(XBZRLE.current_buf, current_data, TARGET_PAGE_SIZE); + memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); /* XBZRLE encoding (if there is no overflow) */ encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, @@ -303,7 +386,10 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data, DPRINTF("Overflow\n"); acct_info.xbzrle_overflows++; /* update data in the cache */ - memcpy(prev_cached_page, current_data, TARGET_PAGE_SIZE); + if (!last_stage) { + memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE); + *current_data = prev_cached_page; + } return -1; } @@ -324,25 +410,14 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t *current_data, return bytes_sent; } - -/* This is the last block that we have visited serching for dirty pages - */ -static RAMBlock *last_seen_block; -/* This is the last block from where we have sent data */ -static RAMBlock *last_sent_block; -static ram_addr_t last_offset; -static unsigned long *migration_bitmap; -static uint64_t migration_dirty_pages; -static uint32_t last_version; -static bool ram_bulk_stage; - static inline ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, ram_addr_t start) { unsigned long base = mr->ram_addr >> TARGET_PAGE_BITS; unsigned long nr = base + (start >> TARGET_PAGE_BITS); - unsigned long size = base + (int128_get64(mr->size) >> TARGET_PAGE_BITS); + uint64_t mr_size = TARGET_PAGE_ALIGN(memory_region_size(mr)); + unsigned long size = base + (mr_size >> TARGET_PAGE_BITS); unsigned long next; @@ -359,11 +434,10 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, return (next - base) << TARGET_PAGE_BITS; } -static inline bool migration_bitmap_set_dirty(MemoryRegion *mr, - ram_addr_t offset) +static inline bool migration_bitmap_set_dirty(ram_addr_t addr) { bool ret; - int nr = (mr->ram_addr + offset) >> TARGET_PAGE_BITS; + int nr = addr >> TARGET_PAGE_BITS; ret = test_and_set_bit(nr, migration_bitmap); @@ -373,12 +447,47 @@ static inline bool migration_bitmap_set_dirty(MemoryRegion *mr, return ret; } +static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length) +{ + ram_addr_t addr; + unsigned long page = BIT_WORD(start >> TARGET_PAGE_BITS); + + /* start address is aligned at the start of a word? */ + if (((page * BITS_PER_LONG) << TARGET_PAGE_BITS) == start) { + int k; + int nr = BITS_TO_LONGS(length >> TARGET_PAGE_BITS); + unsigned long *src = ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION]; + + for (k = page; k < page + nr; k++) { + if (src[k]) { + unsigned long new_dirty; + new_dirty = ~migration_bitmap[k]; + migration_bitmap[k] |= src[k]; + new_dirty &= src[k]; + migration_dirty_pages += ctpopl(new_dirty); + src[k] = 0; + } + } + } else { + for (addr = 0; addr < length; addr += TARGET_PAGE_SIZE) { + if (cpu_physical_memory_get_dirty(start + addr, + TARGET_PAGE_SIZE, + DIRTY_MEMORY_MIGRATION)) { + cpu_physical_memory_reset_dirty(start + addr, + TARGET_PAGE_SIZE, + DIRTY_MEMORY_MIGRATION); + migration_bitmap_set_dirty(start + addr); + } + } + } +} + + /* Needs iothread lock! */ static void migration_bitmap_sync(void) { RAMBlock *block; - ram_addr_t addr; uint64_t num_dirty_pages_init = migration_dirty_pages; MigrationState *s = migrate_get_current(); static int64_t start_time; @@ -386,31 +495,29 @@ static void migration_bitmap_sync(void) static int64_t num_dirty_pages_period; int64_t end_time; int64_t bytes_xfer_now; + static uint64_t xbzrle_cache_miss_prev; + static uint64_t iterations_prev; + + bitmap_sync_count++; if (!bytes_xfer_prev) { bytes_xfer_prev = ram_bytes_transferred(); } if (!start_time) { - start_time = qemu_get_clock_ms(rt_clock); + start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); } trace_migration_bitmap_sync_start(); address_space_sync_dirty_bitmap(&address_space_memory); QTAILQ_FOREACH(block, &ram_list.blocks, next) { - for (addr = 0; addr < block->length; addr += TARGET_PAGE_SIZE) { - if (memory_region_test_and_clear_dirty(block->mr, - addr, TARGET_PAGE_SIZE, - DIRTY_MEMORY_MIGRATION)) { - migration_bitmap_set_dirty(block->mr, addr); - } - } + migration_bitmap_sync_range(block->mr->ram_addr, block->length); } trace_migration_bitmap_sync_end(migration_dirty_pages - num_dirty_pages_init); num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init; - end_time = qemu_get_clock_ms(rt_clock); + end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); /* more than 1 second = 1000 millisecons */ if (end_time > start_time + 1000) { @@ -433,29 +540,113 @@ static void migration_bitmap_sync(void) } else { mig_throttle_on = false; } + if (migrate_use_xbzrle()) { + if (iterations_prev != 0) { + acct_info.xbzrle_cache_miss_rate = + (double)(acct_info.xbzrle_cache_miss - + xbzrle_cache_miss_prev) / + (acct_info.iterations - iterations_prev); + } + iterations_prev = acct_info.iterations; + xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss; + } s->dirty_pages_rate = num_dirty_pages_period * 1000 / (end_time - start_time); s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE; start_time = end_time; num_dirty_pages_period = 0; + s->dirty_sync_count = bitmap_sync_count; + } +} + +/* + * ram_save_page: Send the given page to the stream + * + * Returns: Number of bytes written. + */ +static int ram_save_page(QEMUFile *f, RAMBlock* block, ram_addr_t offset, + bool last_stage) +{ + int bytes_sent; + int cont; + ram_addr_t current_addr; + MemoryRegion *mr = block->mr; + uint8_t *p; + int ret; + bool send_async = true; + + cont = (block == last_sent_block) ? RAM_SAVE_FLAG_CONTINUE : 0; + + p = memory_region_get_ram_ptr(mr) + offset; + + /* In doubt sent page as normal */ + bytes_sent = -1; + ret = ram_control_save_page(f, block->offset, + offset, TARGET_PAGE_SIZE, &bytes_sent); + + XBZRLE_cache_lock(); + + current_addr = block->offset + offset; + if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { + if (ret != RAM_SAVE_CONTROL_DELAYED) { + if (bytes_sent > 0) { + acct_info.norm_pages++; + } else if (bytes_sent == 0) { + acct_info.dup_pages++; + } + } + } else if (is_zero_range(p, TARGET_PAGE_SIZE)) { + acct_info.dup_pages++; + bytes_sent = save_block_hdr(f, block, offset, cont, + RAM_SAVE_FLAG_COMPRESS); + qemu_put_byte(f, 0); + bytes_sent++; + /* Must let xbzrle know, otherwise a previous (now 0'd) cached + * page would be stale + */ + xbzrle_cache_zero_page(current_addr); + } else if (!ram_bulk_stage && migrate_use_xbzrle()) { + bytes_sent = save_xbzrle_page(f, &p, current_addr, block, + offset, cont, last_stage); + if (!last_stage) { + /* Can't send this cached data async, since the cache page + * might get updated before it gets to the wire + */ + send_async = false; + } + } + + /* XBZRLE overflow or normal page */ + if (bytes_sent == -1) { + bytes_sent = save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_PAGE); + if (send_async) { + qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE); + } else { + qemu_put_buffer(f, p, TARGET_PAGE_SIZE); + } + bytes_sent += TARGET_PAGE_SIZE; + acct_info.norm_pages++; } + + XBZRLE_cache_unlock(); + + return bytes_sent; } /* - * ram_save_block: Writes a page of memory to the stream f + * ram_find_and_save_block: Finds a page to send and sends it to f * * Returns: The number of bytes written. * 0 means no dirty pages */ -static int ram_save_block(QEMUFile *f, bool last_stage) +static int ram_find_and_save_block(QEMUFile *f, bool last_stage) { RAMBlock *block = last_seen_block; ram_addr_t offset = last_offset; bool complete_round = false; int bytes_sent = 0; MemoryRegion *mr; - ram_addr_t current_addr; if (!block) block = QTAILQ_FIRST(&ram_list.blocks); @@ -476,48 +667,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage) ram_bulk_stage = false; } } else { - int ret; - uint8_t *p; - int cont = (block == last_sent_block) ? - RAM_SAVE_FLAG_CONTINUE : 0; - - p = memory_region_get_ram_ptr(mr) + offset; - - /* In doubt sent page as normal */ - bytes_sent = -1; - ret = ram_control_save_page(f, block->offset, - offset, TARGET_PAGE_SIZE, &bytes_sent); - - if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { - if (ret != RAM_SAVE_CONTROL_DELAYED) { - if (bytes_sent > 0) { - acct_info.norm_pages++; - } else if (bytes_sent == 0) { - acct_info.dup_pages++; - } - } - } else if (is_zero_page(p)) { - acct_info.dup_pages++; - bytes_sent = save_block_hdr(f, block, offset, cont, - RAM_SAVE_FLAG_COMPRESS); - qemu_put_byte(f, 0); - bytes_sent++; - } else if (!ram_bulk_stage && migrate_use_xbzrle()) { - current_addr = block->offset + offset; - bytes_sent = save_xbzrle_page(f, p, current_addr, block, - offset, cont, last_stage); - if (!last_stage) { - p = get_cached_data(XBZRLE.cache, current_addr); - } - } - - /* XBZRLE overflow or normal page */ - if (bytes_sent == -1) { - bytes_sent = save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_PAGE); - qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE); - bytes_sent += TARGET_PAGE_SIZE; - acct_info.norm_pages++; - } + bytes_sent = ram_save_page(f, block, offset, last_stage); /* if page is unmodified, continue to the next */ if (bytes_sent > 0) { @@ -572,6 +722,12 @@ uint64_t ram_bytes_total(void) return total; } +void free_xbzrle_decoded_buf(void) +{ + g_free(xbzrle_decoded_buf); + xbzrle_decoded_buf = NULL; +} + static void migration_end(void) { if (migration_bitmap) { @@ -580,14 +736,17 @@ static void migration_end(void) migration_bitmap = NULL; } + XBZRLE_cache_lock(); if (XBZRLE.cache) { cache_fini(XBZRLE.cache); g_free(XBZRLE.cache); g_free(XBZRLE.encoded_buf); g_free(XBZRLE.current_buf); - g_free(XBZRLE.decoded_buf); XBZRLE.cache = NULL; + XBZRLE.encoded_buf = NULL; + XBZRLE.current_buf = NULL; } + XBZRLE_cache_unlock(); } static void ram_migration_cancel(void *opaque) @@ -609,24 +768,39 @@ static void reset_ram_globals(void) static int ram_save_setup(QEMUFile *f, void *opaque) { RAMBlock *block; - int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS; + int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */ - migration_bitmap = bitmap_new(ram_pages); - bitmap_set(migration_bitmap, 0, ram_pages); - migration_dirty_pages = ram_pages; mig_throttle_on = false; dirty_rate_high_cnt = 0; + bitmap_sync_count = 0; if (migrate_use_xbzrle()) { + XBZRLE_cache_lock(); XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() / TARGET_PAGE_SIZE, TARGET_PAGE_SIZE); if (!XBZRLE.cache) { - DPRINTF("Error creating cache\n"); + XBZRLE_cache_unlock(); + error_report("Error creating cache"); + return -1; + } + XBZRLE_cache_unlock(); + + /* We prefer not to abort if there is no memory */ + XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); + if (!XBZRLE.encoded_buf) { + error_report("Error allocating encoded_buf"); return -1; } - XBZRLE.encoded_buf = g_malloc0(TARGET_PAGE_SIZE); - XBZRLE.current_buf = g_malloc(TARGET_PAGE_SIZE); + + XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); + if (!XBZRLE.current_buf) { + error_report("Error allocating current_buf"); + g_free(XBZRLE.encoded_buf); + XBZRLE.encoded_buf = NULL; + return -1; + } + acct_clear(); } @@ -635,6 +809,22 @@ static int ram_save_setup(QEMUFile *f, void *opaque) bytes_transferred = 0; reset_ram_globals(); + ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS; + migration_bitmap = bitmap_new(ram_bitmap_pages); + bitmap_set(migration_bitmap, 0, ram_bitmap_pages); + + /* + * Count the total number of pages used by ram blocks not including any + * gaps due to alignment or unplugs. + */ + migration_dirty_pages = 0; + QTAILQ_FOREACH(block, &ram_list.blocks, next) { + uint64_t block_pages; + + block_pages = block->length >> TARGET_PAGE_BITS; + migration_dirty_pages += block_pages; + } + memory_global_dirty_log_start(); migration_bitmap_sync(); qemu_mutex_unlock_iothread(); @@ -672,12 +862,12 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) ram_control_before_iterate(f, RAM_CONTROL_ROUND); - t0 = qemu_get_clock_ns(rt_clock); + t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); i = 0; while ((ret = qemu_file_rate_limit(f)) == 0) { int bytes_sent; - bytes_sent = ram_save_block(f, false); + bytes_sent = ram_find_and_save_block(f, false); /* no more blocks to sent */ if (bytes_sent == 0) { break; @@ -691,7 +881,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) iterations */ if ((i & 63) == 0) { - uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000; + uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000; if (t1 > MAX_WAIT) { DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n", t1, i); @@ -709,15 +899,20 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) */ ram_control_after_iterate(f, RAM_CONTROL_ROUND); + bytes_transferred += total_sent; + + /* + * Do not count these 8 bytes into total_sent, so that we can + * return 0 if no page had been dirtied. + */ + qemu_put_be64(f, RAM_SAVE_FLAG_EOS); + bytes_transferred += 8; + + ret = qemu_file_get_error(f); if (ret < 0) { - bytes_transferred += total_sent; return ret; } - qemu_put_be64(f, RAM_SAVE_FLAG_EOS); - total_sent += 8; - bytes_transferred += total_sent; - return total_sent; } @@ -734,7 +929,7 @@ static int ram_save_complete(QEMUFile *f, void *opaque) while (true) { int bytes_sent; - bytes_sent = ram_save_block(f, true); + bytes_sent = ram_find_and_save_block(f, true); /* no more blocks to sent */ if (bytes_sent == 0) { break; @@ -768,12 +963,11 @@ static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size) static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) { - int ret, rc = 0; unsigned int xh_len; int xh_flags; - if (!XBZRLE.decoded_buf) { - XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE); + if (!xbzrle_decoded_buf) { + xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE); } /* extract RLE header */ @@ -790,21 +984,16 @@ static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) return -1; } /* load data and decode */ - qemu_get_buffer(f, XBZRLE.decoded_buf, xh_len); + qemu_get_buffer(f, xbzrle_decoded_buf, xh_len); /* decode RLE */ - ret = xbzrle_decode_buffer(XBZRLE.decoded_buf, xh_len, host, - TARGET_PAGE_SIZE); - if (ret == -1) { + if (xbzrle_decode_buffer(xbzrle_decoded_buf, xh_len, host, + TARGET_PAGE_SIZE) == -1) { fprintf(stderr, "Failed to load XBZRLE page - decode error!\n"); - rc = -1; - } else if (ret > TARGET_PAGE_SIZE) { - fprintf(stderr, "Failed to load XBZRLE page - size %d exceeds %d!\n", - ret, TARGET_PAGE_SIZE); - abort(); + return -1; } - return rc; + return 0; } static inline void *host_from_stream_offset(QEMUFile *f, @@ -843,15 +1032,8 @@ static inline void *host_from_stream_offset(QEMUFile *f, */ void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) { - if (ch != 0 || !is_zero_page(host)) { + if (ch != 0 || !is_zero_range(host, size)) { memset(host, ch, size); -#ifndef _WIN32 - if (ch == 0 && - (!kvm_enabled() || kvm_has_sync_mmu()) && - getpagesize() <= TARGET_PAGE_SIZE) { - qemu_madvise(host, TARGET_PAGE_SIZE, QEMU_MADV_DONTNEED); - } -#endif } } @@ -864,8 +1046,9 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) seq_iter++; - if (version_id < 4 || version_id > 4) { - return -EINVAL; + if (version_id != 4) { + ret = -EINVAL; + goto done; } do { @@ -875,44 +1058,42 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) addr &= TARGET_PAGE_MASK; if (flags & RAM_SAVE_FLAG_MEM_SIZE) { - if (version_id == 4) { - /* Synchronize RAM block list */ - char id[256]; - ram_addr_t length; - ram_addr_t total_ram_bytes = addr; - - while (total_ram_bytes) { - RAMBlock *block; - uint8_t len; - - len = qemu_get_byte(f); - qemu_get_buffer(f, (uint8_t *)id, len); - id[len] = 0; - length = qemu_get_be64(f); - - QTAILQ_FOREACH(block, &ram_list.blocks, next) { - if (!strncmp(id, block->idstr, sizeof(id))) { - if (block->length != length) { - fprintf(stderr, - "Length mismatch: %s: " RAM_ADDR_FMT - " in != " RAM_ADDR_FMT "\n", id, length, - block->length); - ret = -EINVAL; - goto done; - } - break; + /* Synchronize RAM block list */ + char id[256]; + ram_addr_t length; + ram_addr_t total_ram_bytes = addr; + + while (total_ram_bytes) { + RAMBlock *block; + uint8_t len; + + len = qemu_get_byte(f); + qemu_get_buffer(f, (uint8_t *)id, len); + id[len] = 0; + length = qemu_get_be64(f); + + QTAILQ_FOREACH(block, &ram_list.blocks, next) { + if (!strncmp(id, block->idstr, sizeof(id))) { + if (block->length != length) { + fprintf(stderr, + "Length mismatch: %s: " RAM_ADDR_FMT + " in != " RAM_ADDR_FMT "\n", id, length, + block->length); + ret = -EINVAL; + goto done; } + break; } + } - if (!block) { - fprintf(stderr, "Unknown ramblock \"%s\", cannot " - "accept migration\n", id); - ret = -EINVAL; - goto done; - } - - total_ram_bytes -= length; + if (!block) { + fprintf(stderr, "Unknown ramblock \"%s\", cannot " + "accept migration\n", id); + ret = -EINVAL; + goto done; } + + total_ram_bytes -= length; } } @@ -922,7 +1103,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) host = host_from_stream_offset(f, addr, flags); if (!host) { - return -EINVAL; + ret = -EINVAL; + goto done; } ch = qemu_get_byte(f); @@ -932,14 +1114,16 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) host = host_from_stream_offset(f, addr, flags); if (!host) { - return -EINVAL; + ret = -EINVAL; + goto done; } qemu_get_buffer(f, host, TARGET_PAGE_SIZE); } else if (flags & RAM_SAVE_FLAG_XBZRLE) { void *host = host_from_stream_offset(f, addr, flags); if (!host) { - return -EINVAL; + ret = -EINVAL; + goto done; } if (load_xbzrle(f, addr, host) < 0) { @@ -962,7 +1146,7 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) return ret; } -SaveVMHandlers savevm_ram_handlers = { +static SaveVMHandlers savevm_ram_handlers = { .save_live_setup = ram_save_setup, .save_live_iterate = ram_save_iterate, .save_live_complete = ram_save_complete, @@ -971,6 +1155,12 @@ SaveVMHandlers savevm_ram_handlers = { .cancel = ram_migration_cancel, }; +void ram_mig_init(void) +{ + qemu_mutex_init(&XBZRLE.lock); + register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL); +} + struct soundhw { const char *name; const char *descr; @@ -1112,9 +1302,6 @@ int qemu_uuid_parse(const char *str, uint8_t *uuid) if (ret != 16) { return -1; } -#ifdef TARGET_I386 - smbios_add_field(1, offsetof(struct smbios_type_1, uuid), uuid, 16); -#endif return 0; } @@ -1125,20 +1312,18 @@ void do_acpitable_option(const QemuOpts *opts) acpi_table_add(opts, &err); if (err) { - fprintf(stderr, "Wrong acpi table provided: %s\n", - error_get_pretty(err)); + error_report("Wrong acpi table provided: %s", + error_get_pretty(err)); error_free(err); exit(1); } #endif } -void do_smbios_option(const char *optarg) +void do_smbios_option(QemuOpts *opts) { #ifdef TARGET_I386 - if (smbios_entry_add(optarg) < 0) { - exit(1); - } + smbios_entry_add(opts); #endif } @@ -1195,15 +1380,14 @@ static void mig_sleep_cpu(void *opq) much time in the VM. The migration thread will try to catchup. Workload will experience a performance drop. */ -static void mig_throttle_cpu_down(CPUState *cpu, void *data) -{ - async_run_on_cpu(cpu, mig_sleep_cpu, NULL); -} - static void mig_throttle_guest_down(void) { + CPUState *cpu; + qemu_mutex_lock_iothread(); - qemu_for_each_cpu(mig_throttle_cpu_down, NULL); + CPU_FOREACH(cpu) { + async_run_on_cpu(cpu, mig_sleep_cpu, NULL); + } qemu_mutex_unlock_iothread(); } @@ -1217,11 +1401,11 @@ static void check_guest_throttling(void) } if (!t0) { - t0 = qemu_get_clock_ns(rt_clock); + t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); return; } - t1 = qemu_get_clock_ns(rt_clock); + t1 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); /* If it has been more than 40 ms since the last time the guest * was throttled then do it again. diff --git a/async.c b/async.c index 5ce3633010..6930185e64 100644 --- a/async.c +++ b/async.c @@ -150,7 +150,10 @@ aio_ctx_prepare(GSource *source, gint *timeout) { AioContext *ctx = (AioContext *) source; QEMUBH *bh; + int deadline; + /* We assume there is no timeout already supplied */ + *timeout = -1; for (bh = ctx->first_bh; bh; bh = bh->next) { if (!bh->deleted && bh->scheduled) { if (bh->idle) { @@ -166,6 +169,14 @@ aio_ctx_prepare(GSource *source, gint *timeout) } } + deadline = qemu_timeout_ns_to_ms(timerlistgroup_deadline_ns(&ctx->tlg)); + if (deadline == 0) { + *timeout = 0; + return true; + } else { + *timeout = qemu_soonest_timeout(*timeout, deadline); + } + return false; } @@ -180,7 +191,7 @@ aio_ctx_check(GSource *source) return true; } } - return aio_pending(ctx); + return aio_pending(ctx) || (timerlistgroup_deadline_ns(&ctx->tlg) == 0); } static gboolean @@ -201,10 +212,12 @@ aio_ctx_finalize(GSource *source) AioContext *ctx = (AioContext *) source; thread_pool_free(ctx->thread_pool); - aio_set_event_notifier(ctx, &ctx->notifier, NULL, NULL); + aio_set_event_notifier(ctx, &ctx->notifier, NULL); event_notifier_cleanup(&ctx->notifier); + rfifolock_destroy(&ctx->lock); qemu_mutex_destroy(&ctx->bh_lock); g_array_free(ctx->pollfds, TRUE); + timerlistgroup_deinit(&ctx->tlg); } static GSourceFuncs aio_source_funcs = { @@ -233,6 +246,17 @@ void aio_notify(AioContext *ctx) event_notifier_set(&ctx->notifier); } +static void aio_timerlist_notify(void *opaque) +{ + aio_notify(opaque); +} + +static void aio_rfifolock_cb(void *opaque) +{ + /* Kick owner thread in case they are blocked in aio_poll() */ + aio_notify(opaque); +} + AioContext *aio_context_new(void) { AioContext *ctx; @@ -240,10 +264,12 @@ AioContext *aio_context_new(void) ctx->pollfds = g_array_new(FALSE, FALSE, sizeof(GPollFD)); ctx->thread_pool = NULL; qemu_mutex_init(&ctx->bh_lock); + rfifolock_init(&ctx->lock, aio_rfifolock_cb, ctx); event_notifier_init(&ctx->notifier, false); aio_set_event_notifier(ctx, &ctx->notifier, (EventNotifierHandler *) - event_notifier_test_and_clear, NULL); + event_notifier_test_and_clear); + timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx); return ctx; } @@ -257,3 +283,13 @@ void aio_context_unref(AioContext *ctx) { g_source_unref(&ctx->source); } + +void aio_context_acquire(AioContext *ctx) +{ + rfifolock_lock(&ctx->lock); +} + +void aio_context_release(AioContext *ctx) +{ + rfifolock_unlock(&ctx->lock); +} diff --git a/audio/Makefile.objs b/audio/Makefile.objs index d71a877249..26a0ac9507 100644 --- a/audio/Makefile.objs +++ b/audio/Makefile.objs @@ -14,4 +14,4 @@ common-obj-$(CONFIG_AUDIO_WIN_INT) += audio_win_int.o common-obj-y += wavcapture.o $(obj)/audio.o $(obj)/fmodaudio.o: QEMU_CFLAGS += $(FMOD_CFLAGS) -$(obj)/sdlaudio.o: QEMU_CFLAGS += $(SDL_CFLAGS) +sdlaudio.o-cflags := $(SDL_CFLAGS) diff --git a/audio/audio.c b/audio/audio.c index 02bb8861f8..9d018e9ded 100644 --- a/audio/audio.c +++ b/audio/audio.c @@ -95,7 +95,7 @@ static struct { } }, - .period = { .hertz = 250 }, + .period = { .hertz = 100 }, .plive = 0, .log_to_monitor = 0, .try_poll_in = 1, @@ -1124,10 +1124,11 @@ static int audio_is_timer_needed (void) static void audio_reset_timer (AudioState *s) { if (audio_is_timer_needed ()) { - qemu_mod_timer (s->ts, qemu_get_clock_ns (vm_clock) + 1); + timer_mod (s->ts, + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + conf.period.ticks); } else { - qemu_del_timer (s->ts); + timer_del (s->ts); } } @@ -1811,8 +1812,7 @@ static const VMStateDescription vmstate_audio = { .name = "audio", .version_id = 1, .minimum_version_id = 1, - .minimum_version_id_old = 1, - .fields = (VMStateField []) { + .fields = (VMStateField[]) { VMSTATE_END_OF_LIST() } }; @@ -1834,7 +1834,7 @@ static void audio_init (void) QLIST_INIT (&s->cap_head); atexit (audio_atexit); - s->ts = qemu_new_timer_ns (vm_clock, audio_timer, s); + s->ts = timer_new_ns(QEMU_CLOCK_VIRTUAL, audio_timer, s); if (!s->ts) { hw_error("Could not create audio timer\n"); } diff --git a/audio/mixeng.c b/audio/mixeng.c index 02a9d9fb92..0e4976f271 100644 --- a/audio/mixeng.c +++ b/audio/mixeng.c @@ -348,7 +348,6 @@ void mixeng_clear (struct st_sample *buf, int len) void mixeng_volume (struct st_sample *buf, int len, struct mixeng_volume *vol) { -#ifdef CONFIG_MIXEMU if (vol->mute) { mixeng_clear (buf, len); return; @@ -364,9 +363,4 @@ void mixeng_volume (struct st_sample *buf, int len, struct mixeng_volume *vol) #endif buf += 1; } -#else - (void) buf; - (void) len; - (void) vol; -#endif } diff --git a/audio/mixeng_template.h b/audio/mixeng_template.h index 30849a62a1..77cc89b9e8 100644 --- a/audio/mixeng_template.h +++ b/audio/mixeng_template.h @@ -35,7 +35,7 @@ #define IN_T glue (glue (ITYPE, BSIZE), _t) #ifdef FLOAT_MIXENG -static mixeng_real inline glue (conv_, ET) (IN_T v) +static inline mixeng_real glue (conv_, ET) (IN_T v) { IN_T nv = ENDIAN_CONVERT (v); @@ -54,7 +54,7 @@ static mixeng_real inline glue (conv_, ET) (IN_T v) #endif } -static IN_T inline glue (clip_, ET) (mixeng_real v) +static inline IN_T glue (clip_, ET) (mixeng_real v) { if (v >= 0.5) { return IN_MAX; diff --git a/audio/noaudio.c b/audio/noaudio.c index 9f23aa2cb3..cb386620ae 100644 --- a/audio/noaudio.c +++ b/audio/noaudio.c @@ -46,7 +46,7 @@ static int no_run_out (HWVoiceOut *hw, int live) int64_t ticks; int64_t bytes; - now = qemu_get_clock_ns (vm_clock); + now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); ticks = now - no->old_ticks; bytes = muldiv64 (ticks, hw->info.bytes_per_second, get_ticks_per_sec ()); bytes = audio_MIN (bytes, INT_MAX); @@ -102,7 +102,7 @@ static int no_run_in (HWVoiceIn *hw) int samples = 0; if (dead) { - int64_t now = qemu_get_clock_ns (vm_clock); + int64_t now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); int64_t ticks = now - no->old_ticks; int64_t bytes = muldiv64 (ticks, hw->info.bytes_per_second, get_ticks_per_sec ()); diff --git a/audio/ossaudio.c b/audio/ossaudio.c index 007c64115a..5a73716032 100644 --- a/audio/ossaudio.c +++ b/audio/ossaudio.c @@ -849,6 +849,10 @@ static int oss_ctl_in (HWVoiceIn *hw, int cmd, ...) static void *oss_audio_init (void) { + if (access(conf.devpath_in, R_OK | W_OK) < 0 || + access(conf.devpath_out, R_OK | W_OK) < 0) { + return NULL; + } return &conf; } diff --git a/audio/paaudio.c b/audio/paaudio.c index 8b69778ad9..90ff24500b 100644 --- a/audio/paaudio.c +++ b/audio/paaudio.c @@ -547,11 +547,11 @@ static int qpa_init_out (HWVoiceOut *hw, struct audsettings *as) ss.rate = as->freq; /* - * qemu audio tick runs at 250 Hz (by default), so processing - * data chunks worth 4 ms of sound should be a good fit. + * qemu audio tick runs at 100 Hz (by default), so processing + * data chunks worth 10 ms of sound should be a good fit. */ - ba.tlength = pa_usec_to_bytes (4 * 1000, &ss); - ba.minreq = pa_usec_to_bytes (2 * 1000, &ss); + ba.tlength = pa_usec_to_bytes (10 * 1000, &ss); + ba.minreq = pa_usec_to_bytes (5 * 1000, &ss); ba.maxlength = -1; ba.prebuf = -1; diff --git a/audio/spiceaudio.c b/audio/spiceaudio.c index bc24557de4..fceee50adb 100644 --- a/audio/spiceaudio.c +++ b/audio/spiceaudio.c @@ -25,8 +25,17 @@ #include "audio.h" #include "audio_int.h" -#define LINE_IN_SAMPLES 1024 -#define LINE_OUT_SAMPLES 1024 +#if SPICE_INTERFACE_PLAYBACK_MAJOR > 1 || SPICE_INTERFACE_PLAYBACK_MINOR >= 3 +#define LINE_OUT_SAMPLES (480 * 4) +#else +#define LINE_OUT_SAMPLES (256 * 4) +#endif + +#if SPICE_INTERFACE_RECORD_MAJOR > 2 || SPICE_INTERFACE_RECORD_MINOR >= 3 +#define LINE_IN_SAMPLES (480 * 4) +#else +#define LINE_IN_SAMPLES (256 * 4) +#endif typedef struct SpiceRateCtl { int64_t start_ticks; @@ -81,7 +90,7 @@ static void spice_audio_fini (void *opaque) static void rate_start (SpiceRateCtl *rate) { memset (rate, 0, sizeof (*rate)); - rate->start_ticks = qemu_get_clock_ns (vm_clock); + rate->start_ticks = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); } static int rate_get_samples (struct audio_pcm_info *info, SpiceRateCtl *rate) @@ -91,7 +100,7 @@ static int rate_get_samples (struct audio_pcm_info *info, SpiceRateCtl *rate) int64_t bytes; int64_t samples; - now = qemu_get_clock_ns (vm_clock); + now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); ticks = now - rate->start_ticks; bytes = muldiv64 (ticks, info->bytes_per_second, get_ticks_per_sec ()); samples = (bytes - rate->bytes_sent) >> info->shift; @@ -111,7 +120,11 @@ static int line_out_init (HWVoiceOut *hw, struct audsettings *as) SpiceVoiceOut *out = container_of (hw, SpiceVoiceOut, hw); struct audsettings settings; +#if SPICE_INTERFACE_PLAYBACK_MAJOR > 1 || SPICE_INTERFACE_PLAYBACK_MINOR >= 3 + settings.freq = spice_server_get_best_playback_rate(NULL); +#else settings.freq = SPICE_INTERFACE_PLAYBACK_FREQ; +#endif settings.nchannels = SPICE_INTERFACE_PLAYBACK_CHAN; settings.fmt = AUD_FMT_S16; settings.endianness = AUDIO_HOST_ENDIANNESS; @@ -122,6 +135,9 @@ static int line_out_init (HWVoiceOut *hw, struct audsettings *as) out->sin.base.sif = &playback_sif.base; qemu_spice_add_interface (&out->sin.base); +#if SPICE_INTERFACE_PLAYBACK_MAJOR > 1 || SPICE_INTERFACE_PLAYBACK_MINOR >= 3 + spice_server_set_playback_rate(&out->sin, settings.freq); +#endif return 0; } @@ -232,7 +248,11 @@ static int line_in_init (HWVoiceIn *hw, struct audsettings *as) SpiceVoiceIn *in = container_of (hw, SpiceVoiceIn, hw); struct audsettings settings; +#if SPICE_INTERFACE_RECORD_MAJOR > 2 || SPICE_INTERFACE_RECORD_MINOR >= 3 + settings.freq = spice_server_get_best_record_rate(NULL); +#else settings.freq = SPICE_INTERFACE_RECORD_FREQ; +#endif settings.nchannels = SPICE_INTERFACE_RECORD_CHAN; settings.fmt = AUD_FMT_S16; settings.endianness = AUDIO_HOST_ENDIANNESS; @@ -243,6 +263,9 @@ static int line_in_init (HWVoiceIn *hw, struct audsettings *as) in->sin.base.sif = &record_sif.base; qemu_spice_add_interface (&in->sin.base); +#if SPICE_INTERFACE_RECORD_MAJOR > 2 || SPICE_INTERFACE_RECORD_MINOR >= 3 + spice_server_set_record_rate(&in->sin, settings.freq); +#endif return 0; } diff --git a/audio/wavaudio.c b/audio/wavaudio.c index 950fa8f19c..6846a1a9f7 100644 --- a/audio/wavaudio.c +++ b/audio/wavaudio.c @@ -52,7 +52,7 @@ static int wav_run_out (HWVoiceOut *hw, int live) int rpos, decr, samples; uint8_t *dst; struct st_sample *src; - int64_t now = qemu_get_clock_ns (vm_clock); + int64_t now = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); int64_t ticks = now - wav->old_ticks; int64_t bytes = muldiv64 (ticks, hw->info.bytes_per_second, get_ticks_per_sec ()); diff --git a/backends/Makefile.objs b/backends/Makefile.objs index 42557d54ea..506a46c33b 100644 --- a/backends/Makefile.objs +++ b/backends/Makefile.objs @@ -3,6 +3,9 @@ common-obj-$(CONFIG_POSIX) += rng-random.o common-obj-y += msmouse.o common-obj-$(CONFIG_BRLAPI) += baum.o -$(obj)/baum.o: QEMU_CFLAGS += $(SDL_CFLAGS) +baum.o-cflags := $(SDL_CFLAGS) common-obj-$(CONFIG_TPM) += tpm.o + +common-obj-y += hostmem.o hostmem-ram.o +common-obj-$(CONFIG_LINUX) += hostmem-file.o diff --git a/backends/baum.c b/backends/baum.c index 62aa784436..759003f623 100644 --- a/backends/baum.c +++ b/backends/baum.c @@ -314,9 +314,9 @@ static int baum_eat_packet(BaumDriverState *baum, const uint8_t *buf, int len) return 0; \ if (*cur++ != ESC) { \ DPRINTF("Broken packet %#2x, tossing\n", req); \ - if (qemu_timer_pending(baum->cellCount_timer)) { \ - qemu_del_timer(baum->cellCount_timer); \ - baum_cellCount_timer_cb(baum); \ + if (timer_pending(baum->cellCount_timer)) { \ + timer_del(baum->cellCount_timer); \ + baum_cellCount_timer_cb(baum); \ } \ return (cur - 2 - buf); \ } \ @@ -334,7 +334,7 @@ static int baum_eat_packet(BaumDriverState *baum, const uint8_t *buf, int len) int i; /* Allow 100ms to complete the DisplayData packet */ - qemu_mod_timer(baum->cellCount_timer, qemu_get_clock_ns(vm_clock) + + timer_mod(baum->cellCount_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + get_ticks_per_sec() / 10); for (i = 0; i < baum->x * baum->y ; i++) { EAT(c); @@ -348,7 +348,7 @@ static int baum_eat_packet(BaumDriverState *baum, const uint8_t *buf, int len) c = '?'; text[i] = c; } - qemu_del_timer(baum->cellCount_timer); + timer_del(baum->cellCount_timer); memset(zero, 0, sizeof(zero)); @@ -553,7 +553,7 @@ static void baum_close(struct CharDriverState *chr) { BaumDriverState *baum = chr->opaque; - qemu_free_timer(baum->cellCount_timer); + timer_free(baum->cellCount_timer); if (baum->brlapi) { brlapi__closeConnection(baum->brlapi); g_free(baum->brlapi); @@ -566,8 +566,10 @@ CharDriverState *chr_baum_init(void) BaumDriverState *baum; CharDriverState *chr; brlapi_handle_t *handle; -#ifdef CONFIG_SDL +#if defined(CONFIG_SDL) +#if SDL_COMPILEDVERSION < SDL_VERSIONNUM(2, 0, 0) SDL_SysWMinfo info; +#endif #endif int tty; @@ -588,19 +590,21 @@ CharDriverState *chr_baum_init(void) goto fail_handle; } - baum->cellCount_timer = qemu_new_timer_ns(vm_clock, baum_cellCount_timer_cb, baum); + baum->cellCount_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, baum_cellCount_timer_cb, baum); if (brlapi__getDisplaySize(handle, &baum->x, &baum->y) == -1) { brlapi_perror("baum_init: brlapi_getDisplaySize"); goto fail; } -#ifdef CONFIG_SDL +#if defined(CONFIG_SDL) +#if SDL_COMPILEDVERSION < SDL_VERSIONNUM(2, 0, 0) memset(&info, 0, sizeof(info)); SDL_VERSION(&info.version); if (SDL_GetWMInfo(&info)) tty = info.info.x11.wmwindow; else +#endif #endif tty = BRLAPI_TTY_DEFAULT; @@ -614,7 +618,7 @@ CharDriverState *chr_baum_init(void) return chr; fail: - qemu_free_timer(baum->cellCount_timer); + timer_free(baum->cellCount_timer); brlapi__closeConnection(handle); fail_handle: g_free(handle); diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c new file mode 100644 index 0000000000..7f3001198b --- /dev/null +++ b/backends/hostmem-file.c @@ -0,0 +1,134 @@ +/* + * QEMU Host Memory Backend for hugetlbfs + * + * Copyright (C) 2013 Red Hat Inc + * + * Authors: + * Paolo Bonzini + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu-common.h" +#include "sysemu/hostmem.h" +#include "sysemu/sysemu.h" +#include "qom/object_interfaces.h" + +/* hostmem-file.c */ +/** + * @TYPE_MEMORY_BACKEND_FILE: + * name of backend that uses mmap on a file descriptor + */ +#define TYPE_MEMORY_BACKEND_FILE "memory-file" + +#define MEMORY_BACKEND_FILE(obj) \ + OBJECT_CHECK(HostMemoryBackendFile, (obj), TYPE_MEMORY_BACKEND_FILE) + +typedef struct HostMemoryBackendFile HostMemoryBackendFile; + +struct HostMemoryBackendFile { + HostMemoryBackend parent_obj; + + bool share; + char *mem_path; +}; + +static void +file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) +{ + HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(backend); + + if (!backend->size) { + error_setg(errp, "can't create backend with size 0"); + return; + } + if (!fb->mem_path) { + error_setg(errp, "mem-path property not set"); + return; + } +#ifndef CONFIG_LINUX + error_setg(errp, "-mem-path not supported on this host"); +#else + if (!memory_region_size(&backend->mr)) { + backend->force_prealloc = mem_prealloc; + memory_region_init_ram_from_file(&backend->mr, OBJECT(backend), + object_get_canonical_path(OBJECT(backend)), + backend->size, fb->share, + fb->mem_path, errp); + } +#endif +} + +static void +file_backend_class_init(ObjectClass *oc, void *data) +{ + HostMemoryBackendClass *bc = MEMORY_BACKEND_CLASS(oc); + + bc->alloc = file_backend_memory_alloc; +} + +static char *get_mem_path(Object *o, Error **errp) +{ + HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o); + + return g_strdup(fb->mem_path); +} + +static void set_mem_path(Object *o, const char *str, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(o); + HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o); + + if (memory_region_size(&backend->mr)) { + error_setg(errp, "cannot change property value"); + return; + } + if (fb->mem_path) { + g_free(fb->mem_path); + } + fb->mem_path = g_strdup(str); +} + +static bool file_memory_backend_get_share(Object *o, Error **errp) +{ + HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o); + + return fb->share; +} + +static void file_memory_backend_set_share(Object *o, bool value, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(o); + HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o); + + if (memory_region_size(&backend->mr)) { + error_setg(errp, "cannot change property value"); + return; + } + fb->share = value; +} + +static void +file_backend_instance_init(Object *o) +{ + object_property_add_bool(o, "share", + file_memory_backend_get_share, + file_memory_backend_set_share, NULL); + object_property_add_str(o, "mem-path", get_mem_path, + set_mem_path, NULL); +} + +static const TypeInfo file_backend_info = { + .name = TYPE_MEMORY_BACKEND_FILE, + .parent = TYPE_MEMORY_BACKEND, + .class_init = file_backend_class_init, + .instance_init = file_backend_instance_init, + .instance_size = sizeof(HostMemoryBackendFile), +}; + +static void register_types(void) +{ + type_register_static(&file_backend_info); +} + +type_init(register_types); diff --git a/backends/hostmem-ram.c b/backends/hostmem-ram.c new file mode 100644 index 0000000000..7cbc051fb0 --- /dev/null +++ b/backends/hostmem-ram.c @@ -0,0 +1,53 @@ +/* + * QEMU Host Memory Backend + * + * Copyright (C) 2013 Red Hat Inc + * + * Authors: + * Igor Mammedov + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "sysemu/hostmem.h" +#include "qom/object_interfaces.h" + +#define TYPE_MEMORY_BACKEND_RAM "memory-ram" + + +static void +ram_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) +{ + char *path; + + if (!backend->size) { + error_setg(errp, "can't create backend with size 0"); + return; + } + + path = object_get_canonical_path_component(OBJECT(backend)); + memory_region_init_ram(&backend->mr, OBJECT(backend), path, + backend->size); + g_free(path); +} + +static void +ram_backend_class_init(ObjectClass *oc, void *data) +{ + HostMemoryBackendClass *bc = MEMORY_BACKEND_CLASS(oc); + + bc->alloc = ram_backend_memory_alloc; +} + +static const TypeInfo ram_backend_info = { + .name = TYPE_MEMORY_BACKEND_RAM, + .parent = TYPE_MEMORY_BACKEND, + .class_init = ram_backend_class_init, +}; + +static void register_types(void) +{ + type_register_static(&ram_backend_info); +} + +type_init(register_types); diff --git a/backends/hostmem.c b/backends/hostmem.c new file mode 100644 index 0000000000..e9f69c8119 --- /dev/null +++ b/backends/hostmem.c @@ -0,0 +1,365 @@ +/* + * QEMU Host Memory Backend + * + * Copyright (C) 2013 Red Hat Inc + * + * Authors: + * Igor Mammedov + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "sysemu/hostmem.h" +#include "qapi/visitor.h" +#include "qapi-types.h" +#include "qapi-visit.h" +#include "qapi/qmp/qerror.h" +#include "qom/object_interfaces.h" + +#ifdef CONFIG_NUMA +#include +QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_DEFAULT != MPOL_DEFAULT); +QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_PREFERRED != MPOL_PREFERRED); +QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_BIND != MPOL_BIND); +QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_INTERLEAVE != MPOL_INTERLEAVE); +#endif + +static void +host_memory_backend_get_size(Object *obj, Visitor *v, void *opaque, + const char *name, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + uint64_t value = backend->size; + + visit_type_size(v, &value, name, errp); +} + +static void +host_memory_backend_set_size(Object *obj, Visitor *v, void *opaque, + const char *name, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + uint64_t value; + + if (memory_region_size(&backend->mr)) { + error_setg(errp, "cannot change property value\n"); + return; + } + + visit_type_size(v, &value, name, errp); + if (error_is_set(errp)) { + return; + } + if (!value) { + error_setg(errp, "Property '%s.%s' doesn't take value '%" PRIu64 "'", + object_get_typename(obj), name , value); + return; + } + backend->size = value; +} + +static void +get_host_nodes(Object *obj, Visitor *v, void *opaque, const char *name, + Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + uint16List *host_nodes = NULL; + uint16List **node = &host_nodes; + unsigned long value; + + value = find_first_bit(backend->host_nodes, MAX_NODES); + if (value == MAX_NODES) { + return; + } + + *node = g_malloc0(sizeof(**node)); + (*node)->value = value; + node = &(*node)->next; + + do { + value = find_next_bit(backend->host_nodes, MAX_NODES, value + 1); + if (value == MAX_NODES) { + break; + } + + *node = g_malloc0(sizeof(**node)); + (*node)->value = value; + node = &(*node)->next; + } while (true); + + visit_type_uint16List(v, &host_nodes, name, errp); +} + +static void +set_host_nodes(Object *obj, Visitor *v, void *opaque, const char *name, + Error **errp) +{ +#ifdef CONFIG_NUMA + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + uint16List *l = NULL; + + visit_type_uint16List(v, &l, name, errp); + + while (l) { + bitmap_set(backend->host_nodes, l->value, 1); + l = l->next; + } +#else + error_setg(errp, "NUMA node binding are not supported by this QEMU"); +#endif +} + +static void +get_policy(Object *obj, Visitor *v, void *opaque, const char *name, + Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + int policy = backend->policy; + + visit_type_enum(v, &policy, HostMemPolicy_lookup, NULL, name, errp); +} + +static void +set_policy(Object *obj, Visitor *v, void *opaque, const char *name, + Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + int policy; + + visit_type_enum(v, &policy, HostMemPolicy_lookup, NULL, name, errp); + backend->policy = policy; + +#ifndef CONFIG_NUMA + if (policy != HOST_MEM_POLICY_DEFAULT) { + error_setg(errp, "NUMA policies are not supported by this QEMU"); + } +#endif +} + +static bool host_memory_backend_get_merge(Object *obj, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + + return backend->merge; +} + +static void host_memory_backend_set_merge(Object *obj, bool value, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + + if (!memory_region_size(&backend->mr)) { + backend->merge = value; + return; + } + + if (value != backend->merge) { + void *ptr = memory_region_get_ram_ptr(&backend->mr); + uint64_t sz = memory_region_size(&backend->mr); + + qemu_madvise(ptr, sz, + value ? QEMU_MADV_MERGEABLE : QEMU_MADV_UNMERGEABLE); + backend->merge = value; + } +} + +static bool host_memory_backend_get_dump(Object *obj, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + + return backend->dump; +} + +static void host_memory_backend_set_dump(Object *obj, bool value, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + + if (!memory_region_size(&backend->mr)) { + backend->dump = value; + return; + } + + if (value != backend->dump) { + void *ptr = memory_region_get_ram_ptr(&backend->mr); + uint64_t sz = memory_region_size(&backend->mr); + + qemu_madvise(ptr, sz, + value ? QEMU_MADV_DODUMP : QEMU_MADV_DONTDUMP); + backend->dump = value; + } +} + +static bool host_memory_backend_get_prealloc(Object *obj, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + + return backend->prealloc || backend->force_prealloc; +} + +static void host_memory_backend_set_prealloc(Object *obj, bool value, + Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + + if (backend->force_prealloc) { + if (value) { + error_setg(errp, + "remove -mem-prealloc to use the prealloc property"); + return; + } + } + + if (!memory_region_size(&backend->mr)) { + backend->prealloc = value; + return; + } + + if (value && !backend->prealloc) { + int fd = memory_region_get_fd(&backend->mr); + void *ptr = memory_region_get_ram_ptr(&backend->mr); + uint64_t sz = memory_region_size(&backend->mr); + + os_mem_prealloc(fd, ptr, sz); + backend->prealloc = true; + } +} + + +static void host_memory_backend_initfn(Object *obj) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + + backend->merge = qemu_opt_get_bool(qemu_get_machine_opts(), + "mem-merge", true); + backend->dump = qemu_opt_get_bool(qemu_get_machine_opts(), + "dump-guest-core", true); + backend->prealloc = mem_prealloc; + + object_property_add_bool(obj, "merge", + host_memory_backend_get_merge, + host_memory_backend_set_merge, NULL); + object_property_add_bool(obj, "dump", + host_memory_backend_get_dump, + host_memory_backend_set_dump, NULL); + object_property_add_bool(obj, "prealloc", + host_memory_backend_get_prealloc, + host_memory_backend_set_prealloc, NULL); + object_property_add(obj, "size", "int", + host_memory_backend_get_size, + host_memory_backend_set_size, NULL, NULL, NULL); + object_property_add(obj, "host-nodes", "int", + get_host_nodes, + set_host_nodes, NULL, NULL, NULL); + object_property_add(obj, "policy", "str", + get_policy, + set_policy, NULL, NULL, NULL); +} + +static void host_memory_backend_finalize(Object *obj) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(obj); + + if (memory_region_size(&backend->mr)) { + memory_region_destroy(&backend->mr); + } +} + +static void +host_memory_backend_memory_init(UserCreatable *uc, Error **errp) +{ + HostMemoryBackend *backend = MEMORY_BACKEND(uc); + HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc); + Error *local_err = NULL; + void *ptr; + uint64_t sz; + + if (!bc->alloc) { + error_setg(errp, "memory_alloc is not implemented for type [%s]", + object_get_typename(OBJECT(uc))); + return; + } + + bc->alloc(backend, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + ptr = memory_region_get_ram_ptr(&backend->mr); + sz = memory_region_size(&backend->mr); + + if (backend->merge) { + qemu_madvise(ptr, sz, QEMU_MADV_MERGEABLE); + } + if (!backend->dump) { + qemu_madvise(ptr, sz, QEMU_MADV_DONTDUMP); + } +#ifdef CONFIG_NUMA + unsigned long maxnode = find_last_bit(backend->host_nodes, MAX_NODES); + + /* check for invalid host-nodes and policies and give more verbose + * error messages than mbind(). */ + if (maxnode != MAX_NODES && backend->policy == MPOL_DEFAULT) { + error_setg(errp, "host-nodes must be empty for policy default," + " or you should explicitly specify a policy other" + " than default"); + return; + } else if (maxnode == MAX_NODES && backend->policy != MPOL_DEFAULT) { + error_setg(errp, "host-nodes must be set for policy %s", + HostMemPolicy_lookup[backend->policy]); + return; + } + + /* This is a workaround for a long standing bug in Linux' + * mbind implementation, which cuts off the last specified + * node. + */ + if (mbind(ptr, sz, backend->policy, backend->host_nodes, maxnode + 2, 0)) { + error_setg_errno(errp, errno, + "cannot bind memory to host NUMA nodes"); + return; + } +#endif + /* Preallocate memory after the NUMA policy has been instantiated. + * This is necessary to guarantee memory is allocated with + * specified NUMA policy in place. + */ + if (backend->prealloc) { + os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz); + } +} + +MemoryRegion * +host_memory_backend_get_memory(HostMemoryBackend *backend, Error **errp) +{ + return memory_region_size(&backend->mr) ? &backend->mr : NULL; +} + +static void +host_memory_backend_class_init(ObjectClass *oc, void *data) +{ + UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc); + + ucc->complete = host_memory_backend_memory_init; +} + +static const TypeInfo host_memory_backend_info = { + .name = TYPE_MEMORY_BACKEND, + .parent = TYPE_OBJECT, + .abstract = true, + .class_size = sizeof(HostMemoryBackendClass), + .class_init = host_memory_backend_class_init, + .instance_size = sizeof(HostMemoryBackend), + .instance_init = host_memory_backend_initfn, + .instance_finalize = host_memory_backend_finalize, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static void register_types(void) +{ + type_register_static(&host_memory_backend_info); +} + +type_init(register_types); diff --git a/backends/rng-egd.c b/backends/rng-egd.c index 9e5a5366f7..25bb3b453b 100644 --- a/backends/rng-egd.c +++ b/backends/rng-egd.c @@ -91,12 +91,14 @@ static int rng_egd_chr_can_read(void *opaque) static void rng_egd_chr_read(void *opaque, const uint8_t *buf, int size) { RngEgd *s = RNG_EGD(opaque); + size_t buf_offset = 0; while (size > 0 && s->requests) { RngRequest *req = s->requests->data; int len = MIN(size, req->size - req->offset); - memcpy(req->data + req->offset, buf, len); + memcpy(req->data + req->offset, buf + buf_offset, len); + buf_offset += len; req->offset += len; size -= len; @@ -167,7 +169,6 @@ static void rng_egd_set_chardev(Object *obj, const char *value, Error **errp) if (b->opened) { error_set(errp, QERR_PERMISSION_DENIED); } else { - g_free(s->chr_name); s->chr_name = g_strdup(value); } } diff --git a/backends/rng-random.c b/backends/rng-random.c index 68dfc8a9c6..136499d305 100644 --- a/backends/rng-random.c +++ b/backends/rng-random.c @@ -123,15 +123,15 @@ static void rng_random_init(Object *obj) NULL); s->filename = g_strdup("/dev/random"); + s->fd = -1; } static void rng_random_finalize(Object *obj) { RndRandom *s = RNG_RANDOM(obj); - qemu_set_fd_handler(s->fd, NULL, NULL, NULL); - if (s->fd != -1) { + qemu_set_fd_handler(s->fd, NULL, NULL, NULL); qemu_close(s->fd); } diff --git a/backends/rng.c b/backends/rng.c index 85cb83f5e1..0f2fc11dd8 100644 --- a/backends/rng.c +++ b/backends/rng.c @@ -12,6 +12,7 @@ #include "sysemu/rng.h" #include "qapi/qmp/qerror.h" +#include "qom/object_interfaces.h" void rng_backend_request_entropy(RngBackend *s, size_t size, EntropyReceiveFunc *receive_entropy, @@ -40,15 +41,16 @@ static bool rng_backend_prop_get_opened(Object *obj, Error **errp) return s->opened; } -void rng_backend_open(RngBackend *s, Error **errp) +static void rng_backend_complete(UserCreatable *uc, Error **errp) { - object_property_set_bool(OBJECT(s), true, "opened", errp); + object_property_set_bool(OBJECT(uc), true, "opened", errp); } static void rng_backend_prop_set_opened(Object *obj, bool value, Error **errp) { RngBackend *s = RNG_BACKEND(obj); RngBackendClass *k = RNG_BACKEND_GET_CLASS(s); + Error *local_err = NULL; if (value == s->opened) { return; @@ -60,12 +62,14 @@ static void rng_backend_prop_set_opened(Object *obj, bool value, Error **errp) } if (k->opened) { - k->opened(s, errp); + k->opened(s, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } } - if (!error_is_set(errp)) { - s->opened = value; - } + s->opened = true; } static void rng_backend_init(Object *obj) @@ -76,13 +80,25 @@ static void rng_backend_init(Object *obj) NULL); } +static void rng_backend_class_init(ObjectClass *oc, void *data) +{ + UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc); + + ucc->complete = rng_backend_complete; +} + static const TypeInfo rng_backend_info = { .name = TYPE_RNG_BACKEND, .parent = TYPE_OBJECT, .instance_size = sizeof(RngBackend), .instance_init = rng_backend_init, .class_size = sizeof(RngBackendClass), + .class_init = rng_backend_class_init, .abstract = true, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } }; static void register_types(void) diff --git a/backends/tpm.c b/backends/tpm.c index b73580134c..01860c4acb 100644 --- a/backends/tpm.c +++ b/backends/tpm.c @@ -112,6 +112,7 @@ static void tpm_backend_prop_set_opened(Object *obj, bool value, Error **errp) { TPMBackend *s = TPM_BACKEND(obj); TPMBackendClass *k = TPM_BACKEND_GET_CLASS(s); + Error *local_err = NULL; if (value == s->opened) { return; @@ -123,12 +124,14 @@ static void tpm_backend_prop_set_opened(Object *obj, bool value, Error **errp) } if (k->opened) { - k->opened(s, errp); + k->opened(s, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } } - if (!error_is_set(errp)) { - s->opened = value; - } + s->opened = true; } static void tpm_backend_instance_init(Object *obj) diff --git a/block-migration.c b/block-migration.c index f803f2006f..56951e09ae 100644 --- a/block-migration.c +++ b/block-migration.c @@ -58,6 +58,7 @@ typedef struct BlkMigDevState { /* Protected by block migration lock. */ unsigned long *aio_bitmap; int64_t completed_sectors; + BdrvDirtyBitmap *dirty_bitmap; } BlkMigDevState; typedef struct BlkMigBlock { @@ -309,12 +310,36 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds) /* Called with iothread lock taken. */ -static void set_dirty_tracking(int enable) +static int set_dirty_tracking(void) +{ + BlkMigDevState *bmds; + int ret; + + QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { + bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE, + NULL); + if (!bmds->dirty_bitmap) { + ret = -errno; + goto fail; + } + } + return 0; + +fail: + QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { + if (bmds->dirty_bitmap) { + bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap); + } + } + return ret; +} + +static void unset_dirty_tracking(void) { BlkMigDevState *bmds; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - bdrv_set_dirty_tracking(bmds->bs, enable ? BLOCK_SIZE : 0); + bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap); } } @@ -336,8 +361,8 @@ static void init_blk_migration_it(void *opaque, BlockDriverState *bs) bmds->completed_sectors = 0; bmds->shared_base = block_mig_state.shared_base; alloc_aio_bitmap(bmds); - drive_get_ref(drive_get_by_blockdev(bs)); bdrv_set_in_use(bs, 1); + bdrv_ref(bs); block_mig_state.total_sector_sum += sectors; @@ -432,7 +457,7 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, } else { blk_mig_unlock(); } - if (bdrv_get_dirty(bmds->bs, sector)) { + if (bdrv_get_dirty(bmds->bs, bmds->dirty_bitmap, sector)) { if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) { nr_sectors = total_sectors - sector; @@ -554,7 +579,7 @@ static int64_t get_remaining_dirty(void) int64_t dirty = 0; QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - dirty += bdrv_get_dirty_count(bmds->bs); + dirty += bdrv_get_dirty_count(bmds->bs, bmds->dirty_bitmap); } return dirty << BDRV_SECTOR_BITS; @@ -569,13 +594,13 @@ static void blk_mig_cleanup(void) bdrv_drain_all(); - set_dirty_tracking(0); + unset_dirty_tracking(); blk_mig_lock(); while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) { QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry); bdrv_set_in_use(bmds->bs, 0); - drive_put_ref(drive_get_by_blockdev(bmds->bs)); + bdrv_unref(bmds->bs); g_free(bmds->aio_bitmap); g_free(bmds); } @@ -601,10 +626,17 @@ static int block_save_setup(QEMUFile *f, void *opaque) block_mig_state.submitted, block_mig_state.transferred); qemu_mutex_lock_iothread(); - init_blk_migration(f); /* start track dirty blocks */ - set_dirty_tracking(1); + ret = set_dirty_tracking(); + + if (ret) { + qemu_mutex_unlock_iothread(); + return ret; + } + + init_blk_migration(f); + qemu_mutex_unlock_iothread(); ret = flush_blks(f); @@ -780,7 +812,8 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) } if (flags & BLK_MIG_FLAG_ZERO_BLOCK) { - ret = bdrv_write_zeroes(bs, addr, nr_sectors); + ret = bdrv_write_zeroes(bs, addr, nr_sectors, + BDRV_REQ_MAY_UNMAP); } else { buf = g_malloc(BLOCK_SIZE); qemu_get_buffer(f, buf, BLOCK_SIZE); diff --git a/block.c b/block.c index 01b66d802a..c90c71aa32 100644 --- a/block.c +++ b/block.c @@ -32,6 +32,7 @@ #include "sysemu/sysemu.h" #include "qemu/notify.h" #include "block/coroutine.h" +#include "block/qapi.h" #include "qmp-commands.h" #include "qemu/timer.h" @@ -49,12 +50,12 @@ #include #endif -#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ +struct BdrvDirtyBitmap { + HBitmap *bitmap; + QLIST_ENTRY(BdrvDirtyBitmap) list; +}; -typedef enum { - BDRV_REQ_COPY_ON_READ = 0x1, - BDRV_REQ_ZERO_WRITE = 0x2, -} BdrvRequestFlags; +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load); static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, @@ -69,33 +70,30 @@ static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov); -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags); -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags); static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BdrvRequestFlags flags, BlockDriverCompletionFunc *cb, void *opaque, bool is_write); static void coroutine_fn bdrv_co_do_rw(void *opaque); static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors); - -static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, - bool is_write, double elapsed_time, uint64_t *wait); -static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, - double elapsed_time, uint64_t *wait); -static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, - bool is_write, int64_t *wait); + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); +static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states = + QTAILQ_HEAD_INITIALIZER(graph_bdrv_states); + static QLIST_HEAD(, BlockDriver) bdrv_drivers = QLIST_HEAD_INITIALIZER(bdrv_drivers); @@ -123,70 +121,110 @@ int is_windows_drive(const char *filename) #endif /* throttling disk I/O limits */ -void bdrv_io_limits_disable(BlockDriverState *bs) +void bdrv_set_io_limits(BlockDriverState *bs, + ThrottleConfig *cfg) { - bs->io_limits_enabled = false; + int i; + + throttle_config(&bs->throttle_state, cfg); + + for (i = 0; i < 2; i++) { + qemu_co_enter_next(&bs->throttled_reqs[i]); + } +} + +/* this function drain all the throttled IOs */ +static bool bdrv_start_throttled_reqs(BlockDriverState *bs) +{ + bool drained = false; + bool enabled = bs->io_limits_enabled; + int i; - do {} while (qemu_co_enter_next(&bs->throttled_reqs)); + bs->io_limits_enabled = false; - if (bs->block_timer) { - qemu_del_timer(bs->block_timer); - qemu_free_timer(bs->block_timer); - bs->block_timer = NULL; + for (i = 0; i < 2; i++) { + while (qemu_co_enter_next(&bs->throttled_reqs[i])) { + drained = true; + } } - bs->slice_start = 0; - bs->slice_end = 0; + bs->io_limits_enabled = enabled; + + return drained; } -static void bdrv_block_timer(void *opaque) +void bdrv_io_limits_disable(BlockDriverState *bs) { - BlockDriverState *bs = opaque; + bs->io_limits_enabled = false; + + bdrv_start_throttled_reqs(bs); - qemu_co_enter_next(&bs->throttled_reqs); + throttle_destroy(&bs->throttle_state); } -void bdrv_io_limits_enable(BlockDriverState *bs) +static void bdrv_throttle_read_timer_cb(void *opaque) { - qemu_co_queue_init(&bs->throttled_reqs); - bs->block_timer = qemu_new_timer_ns(vm_clock, bdrv_block_timer, bs); - bs->io_limits_enabled = true; + BlockDriverState *bs = opaque; + qemu_co_enter_next(&bs->throttled_reqs[0]); +} + +static void bdrv_throttle_write_timer_cb(void *opaque) +{ + BlockDriverState *bs = opaque; + qemu_co_enter_next(&bs->throttled_reqs[1]); } -bool bdrv_io_limits_enabled(BlockDriverState *bs) +/* should be called before bdrv_set_io_limits if a limit is set */ +void bdrv_io_limits_enable(BlockDriverState *bs) { - BlockIOLimit *io_limits = &bs->io_limits; - return io_limits->bps[BLOCK_IO_LIMIT_READ] - || io_limits->bps[BLOCK_IO_LIMIT_WRITE] - || io_limits->bps[BLOCK_IO_LIMIT_TOTAL] - || io_limits->iops[BLOCK_IO_LIMIT_READ] - || io_limits->iops[BLOCK_IO_LIMIT_WRITE] - || io_limits->iops[BLOCK_IO_LIMIT_TOTAL]; + assert(!bs->io_limits_enabled); + throttle_init(&bs->throttle_state, + QEMU_CLOCK_VIRTUAL, + bdrv_throttle_read_timer_cb, + bdrv_throttle_write_timer_cb, + bs); + bs->io_limits_enabled = true; } +/* This function makes an IO wait if needed + * + * @nb_sectors: the number of sectors of the IO + * @is_write: is the IO a write + */ static void bdrv_io_limits_intercept(BlockDriverState *bs, - bool is_write, int nb_sectors) + unsigned int bytes, + bool is_write) { - int64_t wait_time = -1; + /* does this io must wait */ + bool must_wait = throttle_schedule_timer(&bs->throttle_state, is_write); - if (!qemu_co_queue_empty(&bs->throttled_reqs)) { - qemu_co_queue_wait(&bs->throttled_reqs); + /* if must wait or any request of this type throttled queue the IO */ + if (must_wait || + !qemu_co_queue_empty(&bs->throttled_reqs[is_write])) { + qemu_co_queue_wait(&bs->throttled_reqs[is_write]); } - /* In fact, we hope to keep each request's timing, in FIFO mode. The next - * throttled requests will not be dequeued until the current request is - * allowed to be serviced. So if the current request still exceeds the - * limits, it will be inserted to the head. All requests followed it will - * be still in throttled_reqs queue. - */ + /* the IO will be executed, do the accounting */ + throttle_account(&bs->throttle_state, is_write, bytes); - while (bdrv_exceed_io_limits(bs, nb_sectors, is_write, &wait_time)) { - qemu_mod_timer(bs->block_timer, - wait_time + qemu_get_clock_ns(vm_clock)); - qemu_co_queue_wait_insert_head(&bs->throttled_reqs); + + /* if the next request must wait -> do nothing */ + if (throttle_schedule_timer(&bs->throttle_state, is_write)) { + return; + } + + /* else queue next request for execution */ + qemu_co_queue_next(&bs->throttled_reqs[is_write]); +} + +size_t bdrv_opt_mem_align(BlockDriverState *bs) +{ + if (!bs || !bs->drv) { + /* 4k should be on the safe side */ + return 4096; } - qemu_co_queue_next(&bs->throttled_reqs); + return bs->bl.opt_mem_alignment; } /* check if the path starts with ":" */ @@ -294,18 +332,33 @@ void bdrv_register(BlockDriver *bdrv) } /* create a new block device (by default it is empty) */ -BlockDriverState *bdrv_new(const char *device_name) +BlockDriverState *bdrv_new(const char *device_name, Error **errp) { BlockDriverState *bs; + if (bdrv_find(device_name)) { + error_setg(errp, "Device with id '%s' already exists", + device_name); + return NULL; + } + if (bdrv_find_node(device_name)) { + error_setg(errp, "Device with node-name '%s' already exists", + device_name); + return NULL; + } + bs = g_malloc0(sizeof(BlockDriverState)); + QLIST_INIT(&bs->dirty_bitmaps); pstrcpy(bs->device_name, sizeof(bs->device_name), device_name); if (device_name[0] != '\0') { - QTAILQ_INSERT_TAIL(&bdrv_states, bs, list); + QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list); } bdrv_iostatus_disable(bs); notifier_list_init(&bs->close_notifiers); notifier_with_return_list_init(&bs->before_write_notifiers); + qemu_co_queue_init(&bs->throttled_reqs[0]); + qemu_co_queue_init(&bs->throttled_reqs[1]); + bs->refcnt = 1; return bs; } @@ -367,18 +420,26 @@ typedef struct CreateCo { char *filename; QEMUOptionParameter *options; int ret; + Error *err; } CreateCo; static void coroutine_fn bdrv_create_co_entry(void *opaque) { + Error *local_err = NULL; + int ret; + CreateCo *cco = opaque; assert(cco->drv); - cco->ret = cco->drv->bdrv_create(cco->filename, cco->options); + ret = cco->drv->bdrv_create(cco->filename, cco->options, &local_err); + if (local_err) { + error_propagate(&cco->err, local_err); + } + cco->ret = ret; } int bdrv_create(BlockDriver *drv, const char* filename, - QEMUOptionParameter *options) + QEMUOptionParameter *options, Error **errp) { int ret; @@ -388,9 +449,11 @@ int bdrv_create(BlockDriver *drv, const char* filename, .filename = g_strdup(filename), .options = options, .ret = NOT_DONE, + .err = NULL, }; if (!drv->bdrv_create) { + error_setg(errp, "Driver '%s' does not support image creation", drv->format_name); ret = -ENOTSUP; goto out; } @@ -407,22 +470,74 @@ int bdrv_create(BlockDriver *drv, const char* filename, } ret = cco.ret; + if (ret < 0) { + if (cco.err) { + error_propagate(errp, cco.err); + } else { + error_setg_errno(errp, -ret, "Could not create image"); + } + } out: g_free(cco.filename); return ret; } -int bdrv_create_file(const char* filename, QEMUOptionParameter *options) +int bdrv_create_file(const char* filename, QEMUOptionParameter *options, + Error **errp) { BlockDriver *drv; + Error *local_err = NULL; + int ret; drv = bdrv_find_protocol(filename, true); if (drv == NULL) { + error_setg(errp, "Could not find protocol for file '%s'", filename); return -ENOENT; } - return bdrv_create(drv, filename, options); + ret = bdrv_create(drv, filename, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + } + return ret; +} + +int bdrv_refresh_limits(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + + memset(&bs->bl, 0, sizeof(bs->bl)); + + if (!drv) { + return 0; + } + + /* Take some limits from the children as a default */ + if (bs->file) { + bdrv_refresh_limits(bs->file); + bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; + bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; + } else { + bs->bl.opt_mem_alignment = 512; + } + + if (bs->backing_hd) { + bdrv_refresh_limits(bs->backing_hd); + bs->bl.opt_transfer_length = + MAX(bs->bl.opt_transfer_length, + bs->backing_hd->bl.opt_transfer_length); + bs->bl.opt_mem_alignment = + MAX(bs->bl.opt_mem_alignment, + bs->backing_hd->bl.opt_mem_alignment); + } + + /* Then let the driver override it */ + if (drv->bdrv_refresh_limits) { + return drv->bdrv_refresh_limits(bs); + } + + return 0; } /* @@ -443,8 +558,9 @@ int get_tmp_filename(char *filename, int size) int fd; const char *tmpdir; tmpdir = getenv("TMPDIR"); - if (!tmpdir) - tmpdir = "/tmp"; + if (!tmpdir) { + tmpdir = "/var/tmp"; + } if (snprintf(filename, size, "%s/vl.XXXXXX", tmpdir) >= size) { return -EOVERFLOW; } @@ -525,7 +641,7 @@ BlockDriver *bdrv_find_protocol(const char *filename, } static int find_image_format(BlockDriverState *bs, const char *filename, - BlockDriver **pdrv) + BlockDriver **pdrv, Error **errp) { int score, score_max; BlockDriver *drv1, *drv; @@ -536,6 +652,7 @@ static int find_image_format(BlockDriverState *bs, const char *filename, if (bs->sg || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) { drv = bdrv_find_format("raw"); if (!drv) { + error_setg(errp, "Could not find raw image format"); ret = -ENOENT; } *pdrv = drv; @@ -544,6 +661,8 @@ static int find_image_format(BlockDriverState *bs, const char *filename, ret = bdrv_pread(bs, 0, buf, sizeof(buf)); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read image for determining its " + "format"); *pdrv = NULL; return ret; } @@ -560,6 +679,8 @@ static int find_image_format(BlockDriverState *bs, const char *filename, } } if (!drv) { + error_setg(errp, "Could not determine image format: No compatible " + "driver found"); ret = -ENOENT; } *pdrv = drv; @@ -583,7 +704,7 @@ static int refresh_total_sectors(BlockDriverState *bs, int64_t hint) if (length < 0) { return length; } - hint = length >> BDRV_SECTOR_BITS; + hint = DIV_ROUND_UP(length, BDRV_SECTOR_SIZE); } bs->total_sectors = hint; @@ -653,6 +774,50 @@ void bdrv_disable_copy_on_read(BlockDriverState *bs) bs->copy_on_read--; } +/* + * Returns the flags that a temporary snapshot should get, based on the + * originally requested flags (the originally requested image will have flags + * like a backing file) + */ +static int bdrv_temp_snapshot_flags(int flags) +{ + return (flags & ~BDRV_O_SNAPSHOT) | BDRV_O_TEMPORARY; +} + +/* + * Returns the flags that bs->file should get, based on the given flags for + * the parent BDS + */ +static int bdrv_inherited_flags(int flags) +{ + /* Enable protocol handling, disable format probing for bs->file */ + flags |= BDRV_O_PROTOCOL; + + /* Our block drivers take care to send flushes and respect unmap policy, + * so we can enable both unconditionally on lower layers. */ + flags |= BDRV_O_CACHE_WB | BDRV_O_UNMAP; + + /* Clear flags that only apply to the top layer */ + flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | BDRV_O_COPY_ON_READ); + + return flags; +} + +/* + * Returns the flags that bs->backing_hd should get, based on the given flags + * for the parent BDS + */ +static int bdrv_backing_flags(int flags) +{ + /* backing files always opened read-only */ + flags &= ~(BDRV_O_RDWR | BDRV_O_COPY_ON_READ); + + /* snapshot=on is handled on the top layer */ + flags &= ~(BDRV_O_SNAPSHOT | BDRV_O_TEMPORARY); + + return flags; +} + static int bdrv_open_flags(BlockDriverState *bs, int flags) { int open_flags = flags | BDRV_O_CACHE_WB; @@ -666,23 +831,57 @@ static int bdrv_open_flags(BlockDriverState *bs, int flags) /* * Snapshots should be writable. */ - if (bs->is_temporary) { + if (flags & BDRV_O_TEMPORARY) { open_flags |= BDRV_O_RDWR; } return open_flags; } +static void bdrv_assign_node_name(BlockDriverState *bs, + const char *node_name, + Error **errp) +{ + if (!node_name) { + return; + } + + /* empty string node name is invalid */ + if (node_name[0] == '\0') { + error_setg(errp, "Empty node name"); + return; + } + + /* takes care of avoiding namespaces collisions */ + if (bdrv_find(node_name)) { + error_setg(errp, "node-name=%s is conflicting with a device id", + node_name); + return; + } + + /* takes care of avoiding duplicates node names */ + if (bdrv_find_node(node_name)) { + error_setg(errp, "Duplicate node name"); + return; + } + + /* copy node name into the bs and insert it into the graph list */ + pstrcpy(bs->node_name, sizeof(bs->node_name), node_name); + QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs, node_list); +} + /* * Common part for opening disk images and files * * Removes all processed options from *options. */ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, - QDict *options, int flags, BlockDriver *drv) + QDict *options, int flags, BlockDriver *drv, Error **errp) { int ret, open_flags; const char *filename; + const char *node_name = NULL; + Error *local_err = NULL; assert(drv != NULL); assert(bs->file == NULL); @@ -694,8 +893,22 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, filename = qdict_get_try_str(options, "filename"); } + if (drv->bdrv_needs_filename && !filename) { + error_setg(errp, "The '%s' block driver requires a file name", + drv->format_name); + return -EINVAL; + } + trace_bdrv_open_common(bs, filename ?: "", flags, drv->format_name); + node_name = qdict_get_try_str(options, "node-name"); + bdrv_assign_node_name(bs, node_name, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return -EINVAL; + } + qdict_del(options, "node-name"); + /* bdrv_open() with directly using a protocol as drv. This layer is already * opened, so assign it to bs (while file becomes a closed BlockDriverState) * and return immediately. */ @@ -705,17 +918,29 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, } bs->open_flags = flags; - bs->buffer_alignment = 512; + bs->guest_block_size = 512; + bs->request_alignment = 512; + bs->zero_beyond_eof = true; open_flags = bdrv_open_flags(bs, flags); bs->read_only = !(open_flags & BDRV_O_RDWR); if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) { + error_setg(errp, + !bs->read_only && bdrv_is_whitelisted(drv, true) + ? "Driver '%s' can only be used for read-only devices" + : "Driver '%s' is not whitelisted", + drv->format_name); return -ENOTSUP; } assert(bs->copy_on_read == 0); /* bdrv_new() and bdrv_close() make it so */ - if (!bs->read_only && (flags & BDRV_O_COPY_ON_READ)) { - bdrv_enable_copy_on_read(bs); + if (flags & BDRV_O_COPY_ON_READ) { + if (!bs->read_only) { + bdrv_enable_copy_on_read(bs); + } else { + error_setg(errp, "Can't use copy-on-read on read-only device"); + return -EINVAL; + } } if (filename != NULL) { @@ -732,36 +957,39 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, /* Open the image, either directly or using a protocol */ if (drv->bdrv_file_open) { assert(file == NULL); - assert(drv->bdrv_parse_filename || filename != NULL); - ret = drv->bdrv_file_open(bs, options, open_flags); + assert(!drv->bdrv_needs_filename || filename != NULL); + ret = drv->bdrv_file_open(bs, options, open_flags, &local_err); } else { if (file == NULL) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, "Can't use '%s' as a " - "block driver for the protocol level", - drv->format_name); + error_setg(errp, "Can't use '%s' as a block driver for the " + "protocol level", drv->format_name); ret = -EINVAL; goto free_and_fail; } - assert(file != NULL); bs->file = file; - ret = drv->bdrv_open(bs, options, open_flags); + ret = drv->bdrv_open(bs, options, open_flags, &local_err); } if (ret < 0) { + if (local_err) { + error_propagate(errp, local_err); + } else if (bs->filename[0]) { + error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename); + } else { + error_setg_errno(errp, -ret, "Could not open image"); + } goto free_and_fail; } ret = refresh_total_sectors(bs, bs->total_sectors); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not refresh total sector count"); goto free_and_fail; } -#ifndef _WIN32 - if (bs->is_temporary) { - assert(filename != NULL); - unlink(filename); - } -#endif + bdrv_refresh_limits(bs); + assert(bdrv_opt_mem_align(bs) != 0); + assert((bs->request_alignment != 0) || bs->sg); return 0; free_and_fail: @@ -775,108 +1003,90 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, /* * Opens a file using a protocol (file, host_device, nbd, ...) * - * options is a QDict of options to pass to the block drivers, or NULL for an - * empty set of options. The reference to the QDict belongs to the block layer - * after the call (even on failure), so if the caller intends to reuse the - * dictionary, it needs to use QINCREF() before calling bdrv_file_open. + * options is an indirect pointer to a QDict of options to pass to the block + * drivers, or pointer to NULL for an empty set of options. If this function + * takes ownership of the QDict reference, it will set *options to NULL; + * otherwise, it will contain unused/unrecognized options after this function + * returns. Then, the caller is responsible for freeing it. If it intends to + * reuse the QDict, QINCREF() should be called beforehand. */ -int bdrv_file_open(BlockDriverState **pbs, const char *filename, - QDict *options, int flags) +static int bdrv_file_open(BlockDriverState *bs, const char *filename, + QDict **options, int flags, Error **errp) { - BlockDriverState *bs; BlockDriver *drv; const char *drvname; - bool allow_protocol_prefix = false; + bool parse_filename = false; + Error *local_err = NULL; int ret; - /* NULL means an empty set of options */ - if (options == NULL) { - options = qdict_new(); - } - - bs = bdrv_new(""); - bs->options = options; - options = qdict_clone_shallow(options); - /* Fetch the file name from the options QDict if necessary */ if (!filename) { - filename = qdict_get_try_str(options, "filename"); - } else if (filename && !qdict_haskey(options, "filename")) { - qdict_put(options, "filename", qstring_from_str(filename)); - allow_protocol_prefix = true; + filename = qdict_get_try_str(*options, "filename"); + } else if (filename && !qdict_haskey(*options, "filename")) { + qdict_put(*options, "filename", qstring_from_str(filename)); + parse_filename = true; } else { - qerror_report(ERROR_CLASS_GENERIC_ERROR, "Can't specify 'file' and " - "'filename' options at the same time"); + error_setg(errp, "Can't specify 'file' and 'filename' options at the " + "same time"); ret = -EINVAL; goto fail; } /* Find the right block driver */ - drvname = qdict_get_try_str(options, "driver"); + drvname = qdict_get_try_str(*options, "driver"); if (drvname) { - drv = bdrv_find_whitelisted_format(drvname, !(flags & BDRV_O_RDWR)); - qdict_del(options, "driver"); + drv = bdrv_find_format(drvname); + if (!drv) { + error_setg(errp, "Unknown driver '%s'", drvname); + } + qdict_del(*options, "driver"); } else if (filename) { - drv = bdrv_find_protocol(filename, allow_protocol_prefix); + drv = bdrv_find_protocol(filename, parse_filename); if (!drv) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, "Unknown protocol"); + error_setg(errp, "Unknown protocol"); } } else { - qerror_report(ERROR_CLASS_GENERIC_ERROR, - "Must specify either driver or file"); + error_setg(errp, "Must specify either driver or file"); drv = NULL; } if (!drv) { + /* errp has been set already */ ret = -ENOENT; goto fail; } /* Parse the filename and open it */ - if (drv->bdrv_parse_filename && filename) { - Error *local_err = NULL; - drv->bdrv_parse_filename(filename, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (drv->bdrv_parse_filename && parse_filename) { + drv->bdrv_parse_filename(filename, *options, &local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } - qdict_del(options, "filename"); - } else if (!drv->bdrv_parse_filename && !filename) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, - "The '%s' block driver requires a file name", - drv->format_name); - ret = -EINVAL; - goto fail; - } - ret = bdrv_open_common(bs, NULL, options, flags, drv); - if (ret < 0) { - goto fail; + if (!drv->bdrv_needs_filename) { + qdict_del(*options, "filename"); + } else { + filename = qdict_get_str(*options, "filename"); + } } - /* Check if any unknown options were used */ - if (qdict_size(options) != 0) { - const QDictEntry *entry = qdict_first(options); - qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block protocol '%s' doesn't " - "support the option '%s'", - drv->format_name, entry->key); - ret = -EINVAL; + if (!drv->bdrv_file_open) { + ret = bdrv_open(&bs, filename, NULL, *options, flags, drv, &local_err); + *options = NULL; + } else { + ret = bdrv_open_common(bs, NULL, *options, flags, drv, &local_err); + } + if (ret < 0) { + error_propagate(errp, local_err); goto fail; } - QDECREF(options); bs->growable = 1; - *pbs = bs; return 0; fail: - QDECREF(options); - if (!bs->drv) { - QDECREF(bs->options); - } - bdrv_delete(bs); return ret; } @@ -888,15 +1098,16 @@ int bdrv_file_open(BlockDriverState **pbs, const char *filename, * function (even on failure), so if the caller intends to reuse the dictionary, * it needs to use QINCREF() before calling bdrv_file_open. */ -int bdrv_open_backing_file(BlockDriverState *bs, QDict *options) +int bdrv_open_backing_file(BlockDriverState *bs, QDict *options, Error **errp) { - char backing_filename[PATH_MAX]; - int back_flags, ret; + char *backing_filename = g_malloc0(PATH_MAX); + int ret = 0; BlockDriver *back_drv = NULL; + Error *local_err = NULL; if (bs->backing_hd != NULL) { QDECREF(options); - return 0; + goto free_exit; } /* NULL means an empty set of options */ @@ -909,179 +1120,291 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict *options) backing_filename[0] = '\0'; } else if (bs->backing_file[0] == '\0' && qdict_size(options) == 0) { QDECREF(options); - return 0; + goto free_exit; + } else { + bdrv_get_full_backing_filename(bs, backing_filename, PATH_MAX); } - bs->backing_hd = bdrv_new(""); - bdrv_get_full_backing_filename(bs, backing_filename, - sizeof(backing_filename)); - if (bs->backing_format[0] != '\0') { back_drv = bdrv_find_format(bs->backing_format); } - /* backing files always opened read-only */ - back_flags = bs->open_flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT); - - ret = bdrv_open(bs->backing_hd, - *backing_filename ? backing_filename : NULL, options, - back_flags, back_drv); + assert(bs->backing_hd == NULL); + ret = bdrv_open(&bs->backing_hd, + *backing_filename ? backing_filename : NULL, NULL, options, + bdrv_backing_flags(bs->open_flags), back_drv, &local_err); if (ret < 0) { - bdrv_delete(bs->backing_hd); bs->backing_hd = NULL; bs->open_flags |= BDRV_O_NO_BACKING; - return ret; + error_setg(errp, "Could not open backing file: %s", + error_get_pretty(local_err)); + error_free(local_err); + goto free_exit; } - return 0; -} -static void extract_subqdict(QDict *src, QDict **dst, const char *start) -{ - const QDictEntry *entry, *next; - const char *p; + if (bs->backing_hd->file) { + pstrcpy(bs->backing_file, sizeof(bs->backing_file), + bs->backing_hd->file->filename); + } - *dst = qdict_new(); - entry = qdict_first(src); + /* Recalculate the BlockLimits with the backing file */ + bdrv_refresh_limits(bs); - while (entry != NULL) { - next = qdict_next(src, entry); - if (strstart(entry->key, start, &p)) { - qobject_incref(entry->value); - qdict_put_obj(*dst, p, entry->value); - qdict_del(src, entry->key); - } - entry = next; - } +free_exit: + g_free(backing_filename); + return ret; } /* - * Opens a disk image (raw, qcow2, vmdk, ...) + * Opens a disk image whose options are given as BlockdevRef in another block + * device's options. * - * options is a QDict of options to pass to the block drivers, or NULL for an - * empty set of options. The reference to the QDict belongs to the block layer - * after the call (even on failure), so if the caller intends to reuse the - * dictionary, it needs to use QINCREF() before calling bdrv_open. + * If allow_none is true, no image will be opened if filename is false and no + * BlockdevRef is given. *pbs will remain unchanged and 0 will be returned. + * + * bdrev_key specifies the key for the image's BlockdevRef in the options QDict. + * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict + * itself, all options starting with "${bdref_key}." are considered part of the + * BlockdevRef. + * + * The BlockdevRef will be removed from the options QDict. + * + * To conform with the behavior of bdrv_open(), *pbs has to be NULL. */ -int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, - int flags, BlockDriver *drv) +int bdrv_open_image(BlockDriverState **pbs, const char *filename, + QDict *options, const char *bdref_key, int flags, + bool allow_none, Error **errp) { + QDict *image_options; int ret; - /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */ - char tmp_filename[PATH_MAX + 1]; - BlockDriverState *file = NULL; - QDict *file_options = NULL; - const char *drvname; + char *bdref_key_dot; + const char *reference; - /* NULL means an empty set of options */ - if (options == NULL) { - options = qdict_new(); - } + assert(pbs); + assert(*pbs == NULL); - bs->options = options; - options = qdict_clone_shallow(options); + bdref_key_dot = g_strdup_printf("%s.", bdref_key); + qdict_extract_subqdict(options, &image_options, bdref_key_dot); + g_free(bdref_key_dot); - /* For snapshot=on, create a temporary qcow2 overlay */ - if (flags & BDRV_O_SNAPSHOT) { - BlockDriverState *bs1; - int64_t total_size; - BlockDriver *bdrv_qcow2; - QEMUOptionParameter *create_options; - char backing_filename[PATH_MAX]; - - if (qdict_size(options) != 0) { - error_report("Can't use snapshot=on with driver-specific options"); + reference = qdict_get_try_str(options, bdref_key); + if (!filename && !reference && !qdict_size(image_options)) { + if (allow_none) { + ret = 0; + } else { + error_setg(errp, "A block device must be specified for \"%s\"", + bdref_key); ret = -EINVAL; - goto fail; } - assert(filename != NULL); + goto done; + } - /* if snapshot, we create a temporary backing file and open it - instead of opening 'filename' directly */ + ret = bdrv_open(pbs, filename, reference, image_options, flags, NULL, errp); - /* if there is a backing file, use it */ - bs1 = bdrv_new(""); - ret = bdrv_open(bs1, filename, NULL, 0, drv); - if (ret < 0) { - bdrv_delete(bs1); - goto fail; - } - total_size = bdrv_getlength(bs1) & BDRV_SECTOR_MASK; +done: + qdict_del(options, bdref_key); + return ret; +} - bdrv_delete(bs1); +void bdrv_append_temp_snapshot(BlockDriverState *bs, int flags, Error **errp) +{ + /* TODO: extra byte is a hack to ensure MAX_PATH space on Windows. */ + char *tmp_filename = g_malloc0(PATH_MAX + 1); + int64_t total_size; + BlockDriver *bdrv_qcow2; + QEMUOptionParameter *create_options; + QDict *snapshot_options; + BlockDriverState *bs_snapshot; + Error *local_err; + int ret; - ret = get_tmp_filename(tmp_filename, sizeof(tmp_filename)); - if (ret < 0) { - goto fail; - } + /* if snapshot, we create a temporary backing file and open it + instead of opening 'filename' directly */ - /* Real path is meaningless for protocols */ - if (path_has_protocol(filename)) { - snprintf(backing_filename, sizeof(backing_filename), - "%s", filename); - } else if (!realpath(filename, backing_filename)) { - ret = -errno; - goto fail; - } + /* Get the required size from the image */ + total_size = bdrv_getlength(bs); + if (total_size < 0) { + error_setg_errno(errp, -total_size, "Could not get image size"); + goto out; + } + total_size &= BDRV_SECTOR_MASK; - bdrv_qcow2 = bdrv_find_format("qcow2"); - create_options = parse_option_parameters("", bdrv_qcow2->create_options, - NULL); + /* Create the temporary image */ + ret = get_tmp_filename(tmp_filename, PATH_MAX + 1); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not get temporary filename"); + goto out; + } - set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size); - set_option_parameter(create_options, BLOCK_OPT_BACKING_FILE, - backing_filename); - if (drv) { - set_option_parameter(create_options, BLOCK_OPT_BACKING_FMT, - drv->format_name); - } + bdrv_qcow2 = bdrv_find_format("qcow2"); + create_options = parse_option_parameters("", bdrv_qcow2->create_options, + NULL); - ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options); - free_option_parameters(create_options); - if (ret < 0) { - goto fail; - } + set_option_parameter_int(create_options, BLOCK_OPT_SIZE, total_size); - filename = tmp_filename; - drv = bdrv_qcow2; - bs->is_temporary = 1; + ret = bdrv_create(bdrv_qcow2, tmp_filename, create_options, &local_err); + free_option_parameters(create_options); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not create temporary overlay " + "'%s': %s", tmp_filename, + error_get_pretty(local_err)); + error_free(local_err); + goto out; } - /* Open image file without format layer */ - if (flags & BDRV_O_RDWR) { - flags |= BDRV_O_ALLOW_RDWR; - } + /* Prepare a new options QDict for the temporary file */ + snapshot_options = qdict_new(); + qdict_put(snapshot_options, "file.driver", + qstring_from_str("file")); + qdict_put(snapshot_options, "file.filename", + qstring_from_str(tmp_filename)); - extract_subqdict(options, &file_options, "file."); + bs_snapshot = bdrv_new("", &error_abort); - ret = bdrv_file_open(&file, filename, file_options, - bdrv_open_flags(bs, flags | BDRV_O_UNMAP)); + ret = bdrv_open(&bs_snapshot, NULL, NULL, snapshot_options, + flags, bdrv_qcow2, &local_err); if (ret < 0) { - goto fail; + error_propagate(errp, local_err); + goto out; } - /* Find the right image format driver */ - drvname = qdict_get_try_str(options, "driver"); - if (drvname) { - drv = bdrv_find_whitelisted_format(drvname, !(flags & BDRV_O_RDWR)); - qdict_del(options, "driver"); - } + bdrv_append(bs_snapshot, bs); - if (!drv) { - ret = find_image_format(file, filename, &drv); - } +out: + g_free(tmp_filename); +} - if (!drv) { - goto unlink_and_fail; - } +/* + * Opens a disk image (raw, qcow2, vmdk, ...) + * + * options is a QDict of options to pass to the block drivers, or NULL for an + * empty set of options. The reference to the QDict belongs to the block layer + * after the call (even on failure), so if the caller intends to reuse the + * dictionary, it needs to use QINCREF() before calling bdrv_open. + * + * If *pbs is NULL, a new BDS will be created with a pointer to it stored there. + * If it is not NULL, the referenced BDS will be reused. + * + * The reference parameter may be used to specify an existing block device which + * should be opened. If specified, neither options nor a filename may be given, + * nor can an existing BDS be reused (that is, *pbs has to be NULL). + */ +int bdrv_open(BlockDriverState **pbs, const char *filename, + const char *reference, QDict *options, int flags, + BlockDriver *drv, Error **errp) +{ + int ret; + BlockDriverState *file = NULL, *bs; + const char *drvname; + Error *local_err = NULL; + int snapshot_flags = 0; + + assert(pbs); + + if (reference) { + bool options_non_empty = options ? qdict_size(options) : false; + QDECREF(options); + + if (*pbs) { + error_setg(errp, "Cannot reuse an existing BDS when referencing " + "another block device"); + return -EINVAL; + } + + if (filename || options_non_empty) { + error_setg(errp, "Cannot reference an existing block device with " + "additional options or a new filename"); + return -EINVAL; + } + + bs = bdrv_lookup_bs(reference, reference, errp); + if (!bs) { + return -ENODEV; + } + bdrv_ref(bs); + *pbs = bs; + return 0; + } + + if (*pbs) { + bs = *pbs; + } else { + bs = bdrv_new("", &error_abort); + } + + /* NULL means an empty set of options */ + if (options == NULL) { + options = qdict_new(); + } + + bs->options = options; + options = qdict_clone_shallow(options); + + if (flags & BDRV_O_PROTOCOL) { + assert(!drv); + ret = bdrv_file_open(bs, filename, &options, flags & ~BDRV_O_PROTOCOL, + &local_err); + if (!ret) { + drv = bs->drv; + goto done; + } else if (bs->drv) { + goto close_and_fail; + } else { + goto fail; + } + } + + /* Open image file without format layer */ + if (flags & BDRV_O_RDWR) { + flags |= BDRV_O_ALLOW_RDWR; + } + if (flags & BDRV_O_SNAPSHOT) { + snapshot_flags = bdrv_temp_snapshot_flags(flags); + flags = bdrv_backing_flags(flags); + } + + assert(file == NULL); + ret = bdrv_open_image(&file, filename, options, "file", + bdrv_inherited_flags(flags), + true, &local_err); + if (ret < 0) { + goto fail; + } + + /* Find the right image format driver */ + drvname = qdict_get_try_str(options, "driver"); + if (drvname) { + drv = bdrv_find_format(drvname); + qdict_del(options, "driver"); + if (!drv) { + error_setg(errp, "Invalid driver: '%s'", drvname); + ret = -EINVAL; + goto fail; + } + } + + if (!drv) { + if (file) { + ret = find_image_format(file, filename, &drv, &local_err); + } else { + error_setg(errp, "Must specify either driver or file"); + ret = -EINVAL; + goto fail; + } + } + + if (!drv) { + goto fail; + } /* Open the image */ - ret = bdrv_open_common(bs, file, options, flags, drv); + ret = bdrv_open_common(bs, file, options, flags, drv, &local_err); if (ret < 0) { - goto unlink_and_fail; + goto fail; } - if (bs->file != file) { - bdrv_delete(file); + if (file && (bs->file != file)) { + bdrv_unref(file); file = NULL; } @@ -1089,52 +1412,85 @@ int bdrv_open(BlockDriverState *bs, const char *filename, QDict *options, if ((flags & BDRV_O_NO_BACKING) == 0) { QDict *backing_options; - extract_subqdict(options, &backing_options, "backing."); - ret = bdrv_open_backing_file(bs, backing_options); + qdict_extract_subqdict(options, &backing_options, "backing."); + ret = bdrv_open_backing_file(bs, backing_options, &local_err); if (ret < 0) { goto close_and_fail; } } + /* For snapshot=on, create a temporary qcow2 overlay. bs points to the + * temporary snapshot afterwards. */ + if (snapshot_flags) { + bdrv_append_temp_snapshot(bs, snapshot_flags, &local_err); + if (local_err) { + error_propagate(errp, local_err); + goto close_and_fail; + } + } + + +done: /* Check if any unknown options were used */ - if (qdict_size(options) != 0) { + if (options && (qdict_size(options) != 0)) { const QDictEntry *entry = qdict_first(options); - qerror_report(ERROR_CLASS_GENERIC_ERROR, "Block format '%s' used by " - "device '%s' doesn't support the option '%s'", - drv->format_name, bs->device_name, entry->key); + if (flags & BDRV_O_PROTOCOL) { + error_setg(errp, "Block protocol '%s' doesn't support the option " + "'%s'", drv->format_name, entry->key); + } else { + error_setg(errp, "Block format '%s' used by device '%s' doesn't " + "support the option '%s'", drv->format_name, + bs->device_name, entry->key); + } ret = -EINVAL; goto close_and_fail; } - QDECREF(options); if (!bdrv_key_required(bs)) { bdrv_dev_change_media_cb(bs, true); + } else if (!runstate_check(RUN_STATE_PRELAUNCH) + && !runstate_check(RUN_STATE_INMIGRATE) + && !runstate_check(RUN_STATE_PAUSED)) { /* HACK */ + error_setg(errp, + "Guest must be stopped for opening of encrypted image"); + ret = -EBUSY; + goto close_and_fail; } - /* throttling disk I/O limits */ - if (bs->io_limits_enabled) { - bdrv_io_limits_enable(bs); - } - + QDECREF(options); + *pbs = bs; return 0; -unlink_and_fail: +fail: if (file != NULL) { - bdrv_delete(file); + bdrv_unref(file); } - if (bs->is_temporary) { - unlink(filename); - } -fail: QDECREF(bs->options); QDECREF(options); bs->options = NULL; + if (!*pbs) { + /* If *pbs is NULL, a new BDS has been created in this function and + needs to be freed now. Otherwise, it does not need to be closed, + since it has not really been opened yet. */ + bdrv_unref(bs); + } + if (local_err) { + error_propagate(errp, local_err); + } return ret; close_and_fail: - bdrv_close(bs); + /* See fail path, but now the BDS has to be always closed */ + if (*pbs) { + bdrv_close(bs); + } else { + bdrv_unref(bs); + } QDECREF(options); + if (local_err) { + error_propagate(errp, local_err); + } return ret; } @@ -1173,8 +1529,11 @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, QSIMPLEQ_INIT(bs_queue); } + /* bdrv_open() masks this flag out */ + flags &= ~BDRV_O_PROTOCOL; + if (bs->file) { - bdrv_reopen_queue(bs_queue, bs->file, flags); + bdrv_reopen_queue(bs_queue, bs->file, bdrv_inherited_flags(flags)); } bs_entry = g_new0(BlockReopenQueueEntry, 1); @@ -1350,6 +1709,8 @@ void bdrv_reopen_commit(BDRVReopenState *reopen_state) reopen_state->bs->enable_write_cache = !!(reopen_state->flags & BDRV_O_CACHE_WB); reopen_state->bs->read_only = !(reopen_state->flags & BDRV_O_RDWR); + + bdrv_refresh_limits(reopen_state->bs); } /* @@ -1382,16 +1743,11 @@ void bdrv_close(BlockDriverState *bs) if (bs->drv) { if (bs->backing_hd) { - bdrv_delete(bs->backing_hd); + bdrv_unref(bs->backing_hd); bs->backing_hd = NULL; } bs->drv->bdrv_close(bs); g_free(bs->opaque); -#ifdef _WIN32 - if (bs->is_temporary) { - unlink(bs->filename); - } -#endif bs->opaque = NULL; bs->drv = NULL; bs->copy_on_read = 0; @@ -1402,11 +1758,12 @@ void bdrv_close(BlockDriverState *bs) bs->valid_key = 0; bs->sg = 0; bs->growable = 0; + bs->zero_beyond_eof = false; QDECREF(bs->options); bs->options = NULL; if (bs->file != NULL) { - bdrv_delete(bs->file); + bdrv_unref(bs->file); bs->file = NULL; } } @@ -1423,11 +1780,43 @@ void bdrv_close_all(void) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { bdrv_close(bs); } } +/* Check if any requests are in-flight (including throttled requests) */ +static bool bdrv_requests_pending(BlockDriverState *bs) +{ + if (!QLIST_EMPTY(&bs->tracked_requests)) { + return true; + } + if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { + return true; + } + if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { + return true; + } + if (bs->file && bdrv_requests_pending(bs->file)) { + return true; + } + if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { + return true; + } + return false; +} + +static bool bdrv_requests_pending_all(void) +{ + BlockDriverState *bs; + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { + if (bdrv_requests_pending(bs)) { + return true; + } + } + return false; +} + /* * Wait for pending requests to complete across all BlockDriverStates * @@ -1442,37 +1831,33 @@ void bdrv_close_all(void) */ void bdrv_drain_all(void) { + /* Always run first iteration so any pending completion BHs run */ + bool busy = true; BlockDriverState *bs; - bool busy; - - do { - busy = qemu_aio_wait(); - /* FIXME: We do not have timer support here, so this is effectively - * a busy wait. - */ - QTAILQ_FOREACH(bs, &bdrv_states, list) { - while (qemu_co_enter_next(&bs->throttled_reqs)) { - busy = true; - } + while (busy) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { + bdrv_start_throttled_reqs(bs); } - } while (busy); - /* If requests are still pending there is a bug somewhere */ - QTAILQ_FOREACH(bs, &bdrv_states, list) { - assert(QLIST_EMPTY(&bs->tracked_requests)); - assert(qemu_co_queue_empty(&bs->throttled_reqs)); + busy = bdrv_requests_pending_all(); + busy |= aio_poll(qemu_get_aio_context(), busy); } } -/* make a BlockDriverState anonymous by removing from bdrv_state list. +/* make a BlockDriverState anonymous by removing from bdrv_state and + * graph_bdrv_state list. Also, NULL terminate the device_name to prevent double remove */ void bdrv_make_anon(BlockDriverState *bs) { if (bs->device_name[0] != '\0') { - QTAILQ_REMOVE(&bdrv_states, bs, list); + QTAILQ_REMOVE(&bdrv_states, bs, device_list); } bs->device_name[0] = '\0'; + if (bs->node_name[0] != '\0') { + QTAILQ_REMOVE(&graph_bdrv_states, bs, node_list); + } + bs->node_name[0] = '\0'; } static void bdrv_rebind(BlockDriverState *bs) @@ -1486,24 +1871,22 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest, BlockDriverState *bs_src) { /* move some fields that need to stay attached to the device */ - bs_dest->open_flags = bs_src->open_flags; /* dev info */ bs_dest->dev_ops = bs_src->dev_ops; bs_dest->dev_opaque = bs_src->dev_opaque; bs_dest->dev = bs_src->dev; - bs_dest->buffer_alignment = bs_src->buffer_alignment; + bs_dest->guest_block_size = bs_src->guest_block_size; bs_dest->copy_on_read = bs_src->copy_on_read; bs_dest->enable_write_cache = bs_src->enable_write_cache; - /* i/o timing parameters */ - bs_dest->slice_start = bs_src->slice_start; - bs_dest->slice_end = bs_src->slice_end; - bs_dest->slice_submitted = bs_src->slice_submitted; - bs_dest->io_limits = bs_src->io_limits; - bs_dest->throttled_reqs = bs_src->throttled_reqs; - bs_dest->block_timer = bs_src->block_timer; + /* i/o throttled req */ + memcpy(&bs_dest->throttle_state, + &bs_src->throttle_state, + sizeof(ThrottleState)); + bs_dest->throttled_reqs[0] = bs_src->throttled_reqs[0]; + bs_dest->throttled_reqs[1] = bs_src->throttled_reqs[1]; bs_dest->io_limits_enabled = bs_src->io_limits_enabled; /* r/w error */ @@ -1515,7 +1898,10 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest, bs_dest->iostatus = bs_src->iostatus; /* dirty bitmap */ - bs_dest->dirty_bitmap = bs_src->dirty_bitmap; + bs_dest->dirty_bitmaps = bs_src->dirty_bitmaps; + + /* reference count */ + bs_dest->refcnt = bs_src->refcnt; /* job */ bs_dest->in_use = bs_src->in_use; @@ -1524,7 +1910,7 @@ static void bdrv_move_feature_fields(BlockDriverState *bs_dest, /* keep the same entry in bdrv_states */ pstrcpy(bs_dest->device_name, sizeof(bs_dest->device_name), bs_src->device_name); - bs_dest->list = bs_src->list; + bs_dest->device_list = bs_src->device_list; } /* @@ -1543,14 +1929,25 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old) { BlockDriverState tmp; + /* The code needs to swap the node_name but simply swapping node_list won't + * work so first remove the nodes from the graph list, do the swap then + * insert them back if needed. + */ + if (bs_new->node_name[0] != '\0') { + QTAILQ_REMOVE(&graph_bdrv_states, bs_new, node_list); + } + if (bs_old->node_name[0] != '\0') { + QTAILQ_REMOVE(&graph_bdrv_states, bs_old, node_list); + } + /* bs_new must be anonymous and shouldn't have anything fancy enabled */ assert(bs_new->device_name[0] == '\0'); - assert(bs_new->dirty_bitmap == NULL); + assert(QLIST_EMPTY(&bs_new->dirty_bitmaps)); assert(bs_new->job == NULL); assert(bs_new->dev == NULL); assert(bs_new->in_use == 0); assert(bs_new->io_limits_enabled == false); - assert(bs_new->block_timer == NULL); + assert(!throttle_have_timer(&bs_new->throttle_state)); tmp = *bs_new; *bs_new = *bs_old; @@ -1569,7 +1966,15 @@ void bdrv_swap(BlockDriverState *bs_new, BlockDriverState *bs_old) assert(bs_new->job == NULL); assert(bs_new->in_use == 0); assert(bs_new->io_limits_enabled == false); - assert(bs_new->block_timer == NULL); + assert(!throttle_have_timer(&bs_new->throttle_state)); + + /* insert the nodes back into the graph node list if needed */ + if (bs_new->node_name[0] != '\0') { + QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_new, node_list); + } + if (bs_old->node_name[0] != '\0') { + QTAILQ_INSERT_TAIL(&graph_bdrv_states, bs_old, node_list); + } bdrv_rebind(bs_new); bdrv_rebind(bs_old); @@ -1600,17 +2005,19 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top) bs_new->drv ? bs_new->drv->format_name : ""); } -void bdrv_delete(BlockDriverState *bs) +static void bdrv_delete(BlockDriverState *bs) { assert(!bs->dev); assert(!bs->job); assert(!bs->in_use); + assert(!bs->refcnt); + assert(QLIST_EMPTY(&bs->dirty_bitmaps)); + + bdrv_close(bs); /* remove from list, if necessary */ bdrv_make_anon(bs); - bdrv_close(bs); - g_free(bs); } @@ -1640,7 +2047,7 @@ void bdrv_detach_dev(BlockDriverState *bs, void *dev) bs->dev = NULL; bs->dev_ops = NULL; bs->dev_opaque = NULL; - bs->buffer_alignment = 512; + bs->guest_block_size = 512; } /* TODO change to return DeviceState * when all users are qdevified */ @@ -1771,10 +2178,10 @@ int bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) int bdrv_commit(BlockDriverState *bs) { BlockDriver *drv = bs->drv; - int64_t sector, total_sectors; + int64_t sector, total_sectors, length, backing_length; int n, ro, open_flags; int ret = 0; - uint8_t *buf; + uint8_t *buf = NULL; char filename[PATH_MAX]; if (!drv) @@ -1799,19 +2206,44 @@ int bdrv_commit(BlockDriverState *bs) } } - total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; + length = bdrv_getlength(bs); + if (length < 0) { + ret = length; + goto ro_cleanup; + } + + backing_length = bdrv_getlength(bs->backing_hd); + if (backing_length < 0) { + ret = backing_length; + goto ro_cleanup; + } + + /* If our top snapshot is larger than the backing file image, + * grow the backing file image if possible. If not possible, + * we must return an error */ + if (length > backing_length) { + ret = bdrv_truncate(bs->backing_hd, length); + if (ret < 0) { + goto ro_cleanup; + } + } + + total_sectors = length >> BDRV_SECTOR_BITS; buf = g_malloc(COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE); for (sector = 0; sector < total_sectors; sector += n) { - if (bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n)) { - - if (bdrv_read(bs, sector, buf, n) != 0) { - ret = -EIO; + ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n); + if (ret < 0) { + goto ro_cleanup; + } + if (ret) { + ret = bdrv_read(bs, sector, buf, n); + if (ret < 0) { goto ro_cleanup; } - if (bdrv_write(bs->backing_hd, sector, buf, n) != 0) { - ret = -EIO; + ret = bdrv_write(bs->backing_hd, sector, buf, n); + if (ret < 0) { goto ro_cleanup; } } @@ -1819,6 +2251,9 @@ int bdrv_commit(BlockDriverState *bs) if (drv->bdrv_make_empty) { ret = drv->bdrv_make_empty(bs); + if (ret < 0) { + goto ro_cleanup; + } bdrv_flush(bs); } @@ -1826,9 +2261,11 @@ int bdrv_commit(BlockDriverState *bs) * Make sure all data we wrote to the backing device is actually * stable on disk. */ - if (bs->backing_hd) + if (bs->backing_hd) { bdrv_flush(bs->backing_hd); + } + ret = 0; ro_cleanup: g_free(buf); @@ -1844,7 +2281,7 @@ int bdrv_commit_all(void) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { if (bs->drv && bs->backing_hd) { int ret = bdrv_commit(bs); if (ret < 0) { @@ -1862,6 +2299,10 @@ int bdrv_commit_all(void) */ static void tracked_request_end(BdrvTrackedRequest *req) { + if (req->serialising) { + req->bs->serialising_in_flight--; + } + QLIST_REMOVE(req, list); qemu_co_queue_restart_all(&req->wait_queue); } @@ -1871,15 +2312,18 @@ static void tracked_request_end(BdrvTrackedRequest *req) */ static void tracked_request_begin(BdrvTrackedRequest *req, BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, bool is_write) + int64_t offset, + unsigned int bytes, bool is_write) { *req = (BdrvTrackedRequest){ .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, - .is_write = is_write, - .co = qemu_coroutine_self(), + .offset = offset, + .bytes = bytes, + .is_write = is_write, + .co = qemu_coroutine_self(), + .serialising = false, + .overlap_offset = offset, + .overlap_bytes = bytes, }; qemu_co_queue_init(&req->wait_queue); @@ -1887,6 +2331,21 @@ static void tracked_request_begin(BdrvTrackedRequest *req, QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); } +static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) +{ + int64_t overlap_offset = req->offset & ~(align - 1); + unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) + - overlap_offset; + + if (!req->serialising) { + req->bs->serialising_in_flight++; + req->serialising = true; + } + + req->overlap_offset = MIN(req->overlap_offset, overlap_offset); + req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); +} + /** * Round a region to cluster boundaries */ @@ -1908,53 +2367,75 @@ void bdrv_round_to_clusters(BlockDriverState *bs, } } +static int bdrv_get_cluster_size(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + int ret; + + ret = bdrv_get_info(bs, &bdi); + if (ret < 0 || bdi.cluster_size == 0) { + return bs->request_alignment; + } else { + return bdi.cluster_size; + } +} + static bool tracked_request_overlaps(BdrvTrackedRequest *req, - int64_t sector_num, int nb_sectors) { + int64_t offset, unsigned int bytes) +{ /* aaaa bbbb */ - if (sector_num >= req->sector_num + req->nb_sectors) { + if (offset >= req->overlap_offset + req->overlap_bytes) { return false; } /* bbbb aaaa */ - if (req->sector_num >= sector_num + nb_sectors) { + if (req->overlap_offset >= offset + bytes) { return false; } return true; } -static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) { + BlockDriverState *bs = self->bs; BdrvTrackedRequest *req; - int64_t cluster_sector_num; - int cluster_nb_sectors; bool retry; + bool waited = false; - /* If we touch the same cluster it counts as an overlap. This guarantees - * that allocating writes will be serialized and not race with each other - * for the same cluster. For example, in copy-on-read it ensures that the - * CoR read and write operations are atomic and guest writes cannot - * interleave between them. - */ - bdrv_round_to_clusters(bs, sector_num, nb_sectors, - &cluster_sector_num, &cluster_nb_sectors); + if (!bs->serialising_in_flight) { + return false; + } do { retry = false; QLIST_FOREACH(req, &bs->tracked_requests, list) { - if (tracked_request_overlaps(req, cluster_sector_num, - cluster_nb_sectors)) { + if (req == self || (!req->serialising && !self->serialising)) { + continue; + } + if (tracked_request_overlaps(req, self->overlap_offset, + self->overlap_bytes)) + { /* Hitting this means there was a reentrant request, for * example, a block driver issuing nested requests. This must * never happen since it means deadlock. */ assert(qemu_coroutine_self() != req->co); - qemu_co_queue_wait(&req->wait_queue); - retry = true; - break; + /* If the request is already (indirectly) waiting for us, or + * will wait for us as soon as it wakes up, then just go on + * (instead of producing a deadlock in the former case). */ + if (!req->waiting_for) { + self->waiting_for = req; + qemu_co_queue_wait(&req->wait_queue); + self->waiting_for = NULL; + retry = true; + waited = true; + break; + } } } } while (retry); + + return waited; } /* @@ -2116,11 +2597,12 @@ int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top, } new_top_bs->backing_hd = base_bs; + bdrv_refresh_limits(new_top_bs); QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) { /* so that bdrv_close() does not recursively close the chain */ intermediate_state->bs->backing_hd = NULL; - bdrv_delete(intermediate_state->bs); + bdrv_unref(intermediate_state->bs); } ret = 0; @@ -2137,6 +2619,10 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, { int64_t len; + if (size > INT_MAX) { + return -EIO; + } + if (!bdrv_is_inserted(bs)) return -ENOMEDIUM; @@ -2157,14 +2643,17 @@ static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { + if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { + return -EIO; + } + return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, nb_sectors * BDRV_SECTOR_SIZE); } typedef struct RwCo { BlockDriverState *bs; - int64_t sector_num; - int nb_sectors; + int64_t offset; QEMUIOVector *qiov; bool is_write; int ret; @@ -2176,34 +2665,32 @@ static void coroutine_fn bdrv_rw_co_entry(void *opaque) RwCo *rwco = opaque; if (!rwco->is_write) { - rwco->ret = bdrv_co_do_readv(rwco->bs, rwco->sector_num, - rwco->nb_sectors, rwco->qiov, - rwco->flags); - } else { - rwco->ret = bdrv_co_do_writev(rwco->bs, rwco->sector_num, - rwco->nb_sectors, rwco->qiov, + rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, rwco->flags); + } else { + rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); } } /* * Process a vectored synchronous request using coroutines */ -static int bdrv_rwv_co(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, bool is_write, - BdrvRequestFlags flags) +static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, + QEMUIOVector *qiov, bool is_write, + BdrvRequestFlags flags) { Coroutine *co; RwCo rwco = { .bs = bs, - .sector_num = sector_num, - .nb_sectors = qiov->size >> BDRV_SECTOR_BITS, + .offset = offset, .qiov = qiov, .is_write = is_write, .ret = NOT_DONE, .flags = flags, }; - assert((qiov->size & (BDRV_SECTOR_SIZE - 1)) == 0); /** * In sync call context, when the vcpu is blocked, this throttling timer @@ -2241,8 +2728,13 @@ static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, .iov_len = nb_sectors * BDRV_SECTOR_SIZE, }; + if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { + return -EINVAL; + } + qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_rwv_co(bs, sector_num, &qiov, is_write, flags); + return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, + &qiov, is_write, flags); } /* return < 0 if error. See bdrv_write() for the return codes */ @@ -2278,128 +2770,108 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num, return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); } -int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov) +int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) { - return bdrv_rwv_co(bs, sector_num, qiov, true, 0); + return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, + BDRV_REQ_ZERO_WRITE | flags); } -int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +/* + * Completely zero out a block device with the help of bdrv_write_zeroes. + * The operation is sped up by checking the block status and only writing + * zeroes to the device if they currently do not return zeroes. Optional + * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). + * + * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). + */ +int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) { - return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, - BDRV_REQ_ZERO_WRITE); + int64_t target_size; + int64_t ret, nb_sectors, sector_num = 0; + int n; + + target_size = bdrv_getlength(bs); + if (target_size < 0) { + return target_size; + } + target_size /= BDRV_SECTOR_SIZE; + + for (;;) { + nb_sectors = target_size - sector_num; + if (nb_sectors <= 0) { + return 0; + } + if (nb_sectors > INT_MAX) { + nb_sectors = INT_MAX; + } + ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); + if (ret < 0) { + error_report("error getting block status at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + if (ret & BDRV_BLOCK_ZERO) { + sector_num += n; + continue; + } + ret = bdrv_write_zeroes(bs, sector_num, n, flags); + if (ret < 0) { + error_report("error writing zeroes at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + sector_num += n; + } } -int bdrv_pread(BlockDriverState *bs, int64_t offset, - void *buf, int count1) +int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) { - uint8_t tmp_buf[BDRV_SECTOR_SIZE]; - int len, nb_sectors, count; - int64_t sector_num; + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = bytes, + }; int ret; - count = count1; - /* first read to align to sector start */ - len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1); - if (len > count) - len = count; - sector_num = offset >> BDRV_SECTOR_BITS; - if (len > 0) { - if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - memcpy(buf, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), len); - count -= len; - if (count == 0) - return count1; - sector_num++; - buf += len; - } - - /* read the sectors "in place" */ - nb_sectors = count >> BDRV_SECTOR_BITS; - if (nb_sectors > 0) { - if ((ret = bdrv_read(bs, sector_num, buf, nb_sectors)) < 0) - return ret; - sector_num += nb_sectors; - len = nb_sectors << BDRV_SECTOR_BITS; - buf += len; - count -= len; + if (bytes < 0) { + return -EINVAL; } - /* add data from the last sector */ - if (count > 0) { - if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - memcpy(buf, tmp_buf, count); + qemu_iovec_init_external(&qiov, &iov, 1); + ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); + if (ret < 0) { + return ret; } - return count1; + + return bytes; } int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) { - uint8_t tmp_buf[BDRV_SECTOR_SIZE]; - int len, nb_sectors, count; - int64_t sector_num; int ret; - count = qiov->size; - - /* first write to align to sector start */ - len = (BDRV_SECTOR_SIZE - offset) & (BDRV_SECTOR_SIZE - 1); - if (len > count) - len = count; - sector_num = offset >> BDRV_SECTOR_BITS; - if (len > 0) { - if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - qemu_iovec_to_buf(qiov, 0, tmp_buf + (offset & (BDRV_SECTOR_SIZE - 1)), - len); - if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - count -= len; - if (count == 0) - return qiov->size; - sector_num++; - } - - /* write the sectors "in place" */ - nb_sectors = count >> BDRV_SECTOR_BITS; - if (nb_sectors > 0) { - QEMUIOVector qiov_inplace; - - qemu_iovec_init(&qiov_inplace, qiov->niov); - qemu_iovec_concat(&qiov_inplace, qiov, len, - nb_sectors << BDRV_SECTOR_BITS); - ret = bdrv_writev(bs, sector_num, &qiov_inplace); - qemu_iovec_destroy(&qiov_inplace); - if (ret < 0) { - return ret; - } - - sector_num += nb_sectors; - len = nb_sectors << BDRV_SECTOR_BITS; - count -= len; + ret = bdrv_prwv_co(bs, offset, qiov, true, 0); + if (ret < 0) { + return ret; } - /* add data from the last sector */ - if (count > 0) { - if ((ret = bdrv_read(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - qemu_iovec_to_buf(qiov, qiov->size - count, tmp_buf, count); - if ((ret = bdrv_write(bs, sector_num, tmp_buf, 1)) < 0) - return ret; - } return qiov->size; } int bdrv_pwrite(BlockDriverState *bs, int64_t offset, - const void *buf, int count1) + const void *buf, int bytes) { QEMUIOVector qiov; struct iovec iov = { .iov_base = (void *) buf, - .iov_len = count1, + .iov_len = bytes, }; + if (bytes < 0) { + return -EINVAL; + } + qemu_iovec_init_external(&qiov, &iov, 1); return bdrv_pwritev(bs, offset, &qiov); } @@ -2468,7 +2940,7 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, if (drv->bdrv_co_write_zeroes && buffer_is_zero(bounce_buffer, iov.iov_len)) { ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, - cluster_nb_sectors); + cluster_nb_sectors, 0); } else { /* This does not change the data on the disk, it is not necessary * to flush even in cache=writethrough mode. @@ -2495,45 +2967,39 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, } /* - * Handle a read request in coroutine context + * Forwards an already correctly aligned request to the BlockDriver. This + * handles copy on read and zeroing after EOF; any other features must be + * implemented by the caller. */ -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) +static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + int64_t align, QEMUIOVector *qiov, int flags) { BlockDriver *drv = bs->drv; - BdrvTrackedRequest req; int ret; - if (!drv) { - return -ENOMEDIUM; - } - if (bdrv_check_request(bs, sector_num, nb_sectors)) { - return -EIO; - } + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - /* throttling disk read I/O */ - if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, false, nb_sectors); - } + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - if (bs->copy_on_read) { - flags |= BDRV_REQ_COPY_ON_READ; - } + /* Handle Copy on Read and associated serialisation */ if (flags & BDRV_REQ_COPY_ON_READ) { - bs->copy_on_read_in_flight++; - } - - if (bs->copy_on_read_in_flight) { - wait_for_overlapping_requests(bs, sector_num, nb_sectors); + /* If we touch the same cluster it counts as an overlap. This + * guarantees that allocating writes will be serialized and not race + * with each other for the same cluster. For example, in copy-on-read + * it ensures that the CoR read and write operations are atomic and + * guest writes cannot interleave between them. */ + mark_request_serialising(req, bdrv_get_cluster_size(bs)); } - tracked_request_begin(&req, bs, sector_num, nb_sectors, false); + wait_serialising_requests(req); if (flags & BDRV_REQ_COPY_ON_READ) { int pnum; - ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum); + ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); if (ret < 0) { goto out; } @@ -2544,18 +3010,128 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, } } - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + /* Forward the request to the BlockDriver */ + if (!(bs->zero_beyond_eof && bs->growable)) { + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else { + /* Read zeros after EOF of growable BDSes */ + int64_t len, total_sectors, max_nb_sectors; + + len = bdrv_getlength(bs); + if (len < 0) { + ret = len; + goto out; + } + + total_sectors = DIV_ROUND_UP(len, BDRV_SECTOR_SIZE); + max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), + align >> BDRV_SECTOR_BITS); + if (max_nb_sectors > 0) { + ret = drv->bdrv_co_readv(bs, sector_num, + MIN(nb_sectors, max_nb_sectors), qiov); + } else { + ret = 0; + } + + /* Reading beyond end of file is supposed to produce zeroes */ + if (ret == 0 && total_sectors < sector_num + nb_sectors) { + uint64_t offset = MAX(0, total_sectors - sector_num); + uint64_t bytes = (sector_num + nb_sectors - offset) * + BDRV_SECTOR_SIZE; + qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); + } + } out: + return ret; +} + +/* + * Handle a read request in coroutine context + */ +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + if (bdrv_check_byte_request(bs, offset, bytes)) { + return -EIO; + } + + if (bs->copy_on_read) { + flags |= BDRV_REQ_COPY_ON_READ; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + bdrv_io_limits_intercept(bs, bytes, false); + } + + /* Align read if necessary by padding qiov */ + if (offset & (align - 1)) { + head_buf = qemu_blockalign(bs, align); + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + tail_buf = qemu_blockalign(bs, align); + qemu_iovec_add(&local_qiov, tail_buf, + align - ((offset + bytes) & (align - 1))); + + bytes = ROUND_UP(bytes, align); + } + + tracked_request_begin(&req, bs, offset, bytes, false); + ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, + use_local_qiov ? &local_qiov : qiov, + flags); tracked_request_end(&req); - if (flags & BDRV_REQ_COPY_ON_READ) { - bs->copy_on_read_in_flight--; + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + qemu_vfree(head_buf); + qemu_vfree(tail_buf); } return ret; } +static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > (UINT_MAX >> BDRV_SECTOR_BITS)) { + return -EINVAL; + } + + return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { @@ -2573,46 +3149,146 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, BDRV_REQ_COPY_ON_READ); } +/* if no limit is specified in the BlockLimits use a default + * of 32768 512-byte sectors (16 MiB) per request. + */ +#define MAX_WRITE_ZEROES_DEFAULT 32768 + static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; QEMUIOVector qiov; - struct iovec iov; - int ret; + struct iovec iov = {0}; + int ret = 0; - /* TODO Emulate only part of misaligned requests instead of letting block - * drivers return -ENOTSUP and emulate everything */ + int max_write_zeroes = bs->bl.max_write_zeroes ? + bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT; - /* First try the efficient write zeroes operation */ - if (drv->bdrv_co_write_zeroes) { - ret = drv->bdrv_co_write_zeroes(bs, sector_num, nb_sectors); - if (ret != -ENOTSUP) { - return ret; + while (nb_sectors > 0 && !ret) { + int num = nb_sectors; + + /* Align request. Block drivers can expect the "bulk" of the request + * to be aligned. + */ + if (bs->bl.write_zeroes_alignment + && num > bs->bl.write_zeroes_alignment) { + if (sector_num % bs->bl.write_zeroes_alignment != 0) { + /* Make a small request up to the first aligned sector. */ + num = bs->bl.write_zeroes_alignment; + num -= sector_num % bs->bl.write_zeroes_alignment; + } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { + /* Shorten the request to the last aligned sector. num cannot + * underflow because num > bs->bl.write_zeroes_alignment. + */ + num -= (sector_num + num) % bs->bl.write_zeroes_alignment; + } } - } - /* Fall back to bounce buffer if write zeroes is unsupported */ - iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE; - iov.iov_base = qemu_blockalign(bs, iov.iov_len); - memset(iov.iov_base, 0, iov.iov_len); - qemu_iovec_init_external(&qiov, &iov, 1); + /* limit request size */ + if (num > max_write_zeroes) { + num = max_write_zeroes; + } + + ret = -ENOTSUP; + /* First try the efficient write zeroes operation */ + if (drv->bdrv_co_write_zeroes) { + ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); + } - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, &qiov); + if (ret == -ENOTSUP) { + /* Fall back to bounce buffer if write zeroes is unsupported */ + iov.iov_len = num * BDRV_SECTOR_SIZE; + if (iov.iov_base == NULL) { + iov.iov_base = qemu_blockalign(bs, num * BDRV_SECTOR_SIZE); + memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); + } + qemu_iovec_init_external(&qiov, &iov, 1); + + ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); + + /* Keep bounce buffer around if it is big enough for all + * all future requests. + */ + if (num < max_write_zeroes) { + qemu_vfree(iov.iov_base); + iov.iov_base = NULL; + } + } + + sector_num += num; + nb_sectors -= num; + } qemu_vfree(iov.iov_base); return ret; } +/* + * Forwards an already correctly aligned write request to the BlockDriver. + */ +static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + bool waited; + int ret; + + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + waited = wait_serialising_requests(req); + assert(!waited || !req->serialising); + assert(req->overlap_offset <= offset); + assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); + + ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); + + if (ret < 0) { + /* Do nothing, write notifier decided to fail this request */ + } else if (flags & BDRV_REQ_ZERO_WRITE) { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); + ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); + } else { + BLKDBG_EVENT(bs, BLKDBG_PWRITEV); + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); + + if (ret == 0 && !bs->enable_write_cache) { + ret = bdrv_co_flush(bs); + } + + bdrv_set_dirty(bs, sector_num, nb_sectors); + + if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { + bs->wr_highest_sector = sector_num + nb_sectors - 1; + } + if (bs->growable && ret >= 0) { + bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); + } + + return ret; +} + /* * Handle a write request in coroutine context */ -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { - BlockDriver *drv = bs->drv; BdrvTrackedRequest req; + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; int ret; if (!bs->drv) { @@ -2621,48 +3297,118 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, if (bs->read_only) { return -EACCES; } - if (bdrv_check_request(bs, sector_num, nb_sectors)) { + if (bdrv_check_byte_request(bs, offset, bytes)) { return -EIO; } - /* throttling disk write I/O */ + /* throttling disk I/O */ if (bs->io_limits_enabled) { - bdrv_io_limits_intercept(bs, true, nb_sectors); + bdrv_io_limits_intercept(bs, bytes, true); } - if (bs->copy_on_read_in_flight) { - wait_for_overlapping_requests(bs, sector_num, nb_sectors); - } + /* + * Align write if necessary by performing a read-modify-write cycle. + * Pad qiov with the read parts and be sure to have a tracked request not + * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. + */ + tracked_request_begin(&req, bs, offset, bytes, true); - tracked_request_begin(&req, bs, sector_num, nb_sectors, true); + if (offset & (align - 1)) { + QEMUIOVector head_qiov; + struct iovec head_iov; - ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req); + mark_request_serialising(&req, align); + wait_serialising_requests(&req); - if (ret < 0) { - /* Do nothing, write notifier decided to fail this request */ - } else if (flags & BDRV_REQ_ZERO_WRITE) { - ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors); - } else { - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); - } + head_buf = qemu_blockalign(bs, align); + head_iov = (struct iovec) { + .iov_base = head_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&head_qiov, &head_iov, 1); - if (ret == 0 && !bs->enable_write_cache) { - ret = bdrv_co_flush(bs); - } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); + ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, + align, &head_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); - if (bs->dirty_bitmap) { - bdrv_set_dirty(bs, sector_num, nb_sectors); + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); } - if (bs->wr_highest_sector < sector_num + nb_sectors - 1) { - bs->wr_highest_sector = sector_num + nb_sectors - 1; + if ((offset + bytes) & (align - 1)) { + QEMUIOVector tail_qiov; + struct iovec tail_iov; + size_t tail_bytes; + bool waited; + + mark_request_serialising(&req, align); + waited = wait_serialising_requests(&req); + assert(!waited || !use_local_qiov); + + tail_buf = qemu_blockalign(bs, align); + tail_iov = (struct iovec) { + .iov_base = tail_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); + + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); + ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, + align, &tail_qiov, 0); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + + tail_bytes = (offset + bytes) & (align - 1); + qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); + + bytes = ROUND_UP(bytes, align); } + ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, + use_local_qiov ? &local_qiov : qiov, + flags); + +fail: tracked_request_end(&req); + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + } + qemu_vfree(head_buf); + qemu_vfree(tail_buf); + return ret; } +static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > (INT_MAX >> BDRV_SECTOR_BITS)) { + return -EINVAL; + } + + return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { @@ -2672,12 +3418,17 @@ int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, } int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) + int64_t sector_num, int nb_sectors, + BdrvRequestFlags flags) { - trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors); + trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); + + if (!(bs->open_flags & BDRV_O_UNMAP)) { + flags &= ~BDRV_REQ_MAY_UNMAP; + } return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, - BDRV_REQ_ZERO_WRITE); + BDRV_REQ_ZERO_WRITE | flags); } /** @@ -2731,9 +3482,10 @@ int64_t bdrv_getlength(BlockDriverState *bs) if (!drv) return -ENOMEDIUM; - if (bs->growable || bdrv_dev_has_removable_media(bs)) { - if (drv->bdrv_getlength) { - return drv->bdrv_getlength(bs); + if (drv->has_variable_length) { + int ret = refresh_total_sectors(bs, bs->total_sectors); + if (ret < 0) { + return ret; } } return bs->total_sectors * BDRV_SECTOR_SIZE; @@ -2751,14 +3503,6 @@ void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr) *nb_sectors_ptr = length; } -/* throttling disk io limits */ -void bdrv_set_io_limits(BlockDriverState *bs, - BlockIOLimit *io_limits) -{ - bs->io_limits = *io_limits; - bs->io_limits_enabled = bdrv_io_limits_enabled(bs); -} - void bdrv_set_on_error(BlockDriverState *bs, BlockdevOnError on_read_error, BlockdevOnError on_write_error) { @@ -2882,17 +3626,33 @@ void bdrv_iterate_format(void (*it)(void *opaque, const char *name), void *opaque) { BlockDriver *drv; + int count = 0; + const char **formats = NULL; QLIST_FOREACH(drv, &bdrv_drivers, list) { - it(opaque, drv->format_name); + if (drv->format_name) { + bool found = false; + int i = count; + while (formats && i && !found) { + found = !strcmp(formats[--i], drv->format_name); + } + + if (!found) { + formats = g_realloc(formats, (count + 1) * sizeof(char *)); + formats[count++] = drv->format_name; + it(opaque, drv->format_name); + } + } } + g_free(formats); } +/* This function is to find block backend bs */ BlockDriverState *bdrv_find(const char *name) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { if (!strcmp(name, bs->device_name)) { return bs; } @@ -2900,19 +3660,79 @@ BlockDriverState *bdrv_find(const char *name) return NULL; } +/* This function is to find a node in the bs graph */ +BlockDriverState *bdrv_find_node(const char *node_name) +{ + BlockDriverState *bs; + + assert(node_name); + + QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { + if (!strcmp(node_name, bs->node_name)) { + return bs; + } + } + return NULL; +} + +/* Put this QMP function here so it can access the static graph_bdrv_states. */ +BlockDeviceInfoList *bdrv_named_nodes_list(void) +{ + BlockDeviceInfoList *list, *entry; + BlockDriverState *bs; + + list = NULL; + QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) { + entry = g_malloc0(sizeof(*entry)); + entry->value = bdrv_block_device_info(bs); + entry->next = list; + list = entry; + } + + return list; +} + +BlockDriverState *bdrv_lookup_bs(const char *device, + const char *node_name, + Error **errp) +{ + BlockDriverState *bs = NULL; + + if (device) { + bs = bdrv_find(device); + + if (bs) { + return bs; + } + } + + if (node_name) { + bs = bdrv_find_node(node_name); + + if (bs) { + return bs; + } + } + + error_setg(errp, "Cannot find device=%s nor node_name=%s", + device ? device : "", + node_name ? node_name : ""); + return NULL; +} + BlockDriverState *bdrv_next(BlockDriverState *bs) { if (!bs) { return QTAILQ_FIRST(&bdrv_states); } - return QTAILQ_NEXT(bs, list); + return QTAILQ_NEXT(bs, device_list); } void bdrv_iterate(void (*it)(void *opaque, BlockDriverState *bs), void *opaque) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { it(opaque, bs); } } @@ -2932,7 +3752,7 @@ int bdrv_flush_all(void) BlockDriverState *bs; int result = 0; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { int ret = bdrv_flush(bs); if (ret < 0 && !result) { result = ret; @@ -2951,6 +3771,11 @@ int bdrv_has_zero_init(BlockDriverState *bs) { assert(bs->drv); + /* If BS is a copy on write image, it is initialized to + the contents of the base image, which may not be zeroes. */ + if (bs->backing_hd) { + return 0; + } if (bs->drv->bdrv_has_zero_init) { return bs->drv->bdrv_has_zero_init(bs); } @@ -2959,15 +3784,45 @@ int bdrv_has_zero_init(BlockDriverState *bs) return 0; } -typedef struct BdrvCoIsAllocatedData { +bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + + if (bs->backing_hd) { + return false; + } + + if (bdrv_get_info(bs, &bdi) == 0) { + return bdi.unallocated_blocks_are_zero; + } + + return false; +} + +bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + + if (bs->backing_hd || !(bs->open_flags & BDRV_O_UNMAP)) { + return false; + } + + if (bdrv_get_info(bs, &bdi) == 0) { + return bdi.can_write_zeroes_with_unmap; + } + + return false; +} + +typedef struct BdrvCoGetBlockStatusData { BlockDriverState *bs; BlockDriverState *base; int64_t sector_num; int nb_sectors; int *pnum; - int ret; + int64_t ret; bool done; -} BdrvCoIsAllocatedData; +} BdrvCoGetBlockStatusData; /* * Returns true iff the specified sector is present in the disk image. Drivers @@ -2984,12 +3839,20 @@ typedef struct BdrvCoIsAllocatedData { * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes * beyond the end of the disk image it will be clamped. */ -int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) +static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) { + int64_t length; int64_t n; + int64_t ret, ret2; + + length = bdrv_getlength(bs); + if (length < 0) { + return length; + } - if (sector_num >= bs->total_sectors) { + if (sector_num >= (length >> BDRV_SECTOR_BITS)) { *pnum = 0; return 0; } @@ -2999,35 +3862,76 @@ int coroutine_fn bdrv_co_is_allocated(BlockDriverState *bs, int64_t sector_num, nb_sectors = n; } - if (!bs->drv->bdrv_co_is_allocated) { + if (!bs->drv->bdrv_co_get_block_status) { *pnum = nb_sectors; - return 1; + ret = BDRV_BLOCK_DATA; + if (bs->drv->protocol_name) { + ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); + } + return ret; + } + + ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + *pnum = 0; + return ret; + } + + if (ret & BDRV_BLOCK_RAW) { + assert(ret & BDRV_BLOCK_OFFSET_VALID); + return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, + *pnum, pnum); + } + + if (!(ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO)) { + if (bdrv_unallocated_blocks_are_zero(bs)) { + ret |= BDRV_BLOCK_ZERO; + } else if (bs->backing_hd) { + BlockDriverState *bs2 = bs->backing_hd; + int64_t length2 = bdrv_getlength(bs2); + if (length2 >= 0 && sector_num >= (length2 >> BDRV_SECTOR_BITS)) { + ret |= BDRV_BLOCK_ZERO; + } + } + } + + if (bs->file && + (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && + (ret & BDRV_BLOCK_OFFSET_VALID)) { + ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, + *pnum, pnum); + if (ret2 >= 0) { + /* Ignore errors. This is just providing extra information, it + * is useful but not necessary. + */ + ret |= (ret2 & BDRV_BLOCK_ZERO); + } } - return bs->drv->bdrv_co_is_allocated(bs, sector_num, nb_sectors, pnum); + return ret; } -/* Coroutine wrapper for bdrv_is_allocated() */ -static void coroutine_fn bdrv_is_allocated_co_entry(void *opaque) +/* Coroutine wrapper for bdrv_get_block_status() */ +static void coroutine_fn bdrv_get_block_status_co_entry(void *opaque) { - BdrvCoIsAllocatedData *data = opaque; + BdrvCoGetBlockStatusData *data = opaque; BlockDriverState *bs = data->bs; - data->ret = bdrv_co_is_allocated(bs, data->sector_num, data->nb_sectors, - data->pnum); + data->ret = bdrv_co_get_block_status(bs, data->sector_num, data->nb_sectors, + data->pnum); data->done = true; } /* - * Synchronous wrapper around bdrv_co_is_allocated(). + * Synchronous wrapper around bdrv_co_get_block_status(). * - * See bdrv_co_is_allocated() for details. + * See bdrv_co_get_block_status() for details. */ -int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - int *pnum) +int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) { Coroutine *co; - BdrvCoIsAllocatedData data = { + BdrvCoGetBlockStatusData data = { .bs = bs, .sector_num = sector_num, .nb_sectors = nb_sectors, @@ -3035,14 +3939,31 @@ int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, .done = false, }; - co = qemu_coroutine_create(bdrv_is_allocated_co_entry); - qemu_coroutine_enter(co, &data); - while (!data.done) { - qemu_aio_wait(); + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_get_block_status_co_entry(&data); + } else { + co = qemu_coroutine_create(bdrv_get_block_status_co_entry); + qemu_coroutine_enter(co, &data); + while (!data.done) { + qemu_aio_wait(); + } } return data.ret; } +int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + return ret; + } + return + (ret & BDRV_BLOCK_DATA) || + ((ret & BDRV_BLOCK_ZERO) && !bdrv_has_zero_init(bs)); +} + /* * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] * @@ -3055,10 +3976,10 @@ int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, * allocated/unallocated state. * */ -int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top, - BlockDriverState *base, - int64_t sector_num, - int nb_sectors, int *pnum) +int bdrv_is_allocated_above(BlockDriverState *top, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, int *pnum) { BlockDriverState *intermediate; int ret, n = nb_sectors; @@ -3066,8 +3987,8 @@ int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top, intermediate = top; while (intermediate && intermediate != base) { int pnum_inter; - ret = bdrv_co_is_allocated(intermediate, sector_num, nb_sectors, - &pnum_inter); + ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, + &pnum_inter); if (ret < 0) { return ret; } else if (ret) { @@ -3094,44 +4015,6 @@ int coroutine_fn bdrv_co_is_allocated_above(BlockDriverState *top, return 0; } -/* Coroutine wrapper for bdrv_is_allocated_above() */ -static void coroutine_fn bdrv_is_allocated_above_co_entry(void *opaque) -{ - BdrvCoIsAllocatedData *data = opaque; - BlockDriverState *top = data->bs; - BlockDriverState *base = data->base; - - data->ret = bdrv_co_is_allocated_above(top, base, data->sector_num, - data->nb_sectors, data->pnum); - data->done = true; -} - -/* - * Synchronous wrapper around bdrv_co_is_allocated_above(). - * - * See bdrv_co_is_allocated_above() for details. - */ -int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base, - int64_t sector_num, int nb_sectors, int *pnum) -{ - Coroutine *co; - BdrvCoIsAllocatedData data = { - .bs = top, - .base = base, - .sector_num = sector_num, - .nb_sectors = nb_sectors, - .pnum = pnum, - .done = false, - }; - - co = qemu_coroutine_create(bdrv_is_allocated_above_co_entry); - qemu_coroutine_enter(co, &data); - while (!data.done) { - qemu_aio_wait(); - } - return data.ret; -} - const char *bdrv_get_encrypted_filename(BlockDriverState *bs) { if (bs->backing_hd && bs->backing_hd->encrypted) @@ -3159,7 +4042,7 @@ int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, if (bdrv_check_request(bs, sector_num, nb_sectors)) return -EIO; - assert(!bs->dirty_bitmap); + assert(QLIST_EMPTY(&bs->dirty_bitmaps)); return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); } @@ -3175,6 +4058,15 @@ int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) return drv->bdrv_get_info(bs, bdi); } +ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_get_specific_info) { + return drv->bdrv_get_specific_info(bs); + } + return NULL; +} + int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, int64_t pos, int size) { @@ -3239,9 +4131,22 @@ int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, return -ENOTSUP; } +int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag) +{ + while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) { + bs = bs->file; + } + + if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) { + return bs->drv->bdrv_debug_remove_breakpoint(bs, tag); + } + + return -ENOTSUP; +} + int bdrv_debug_resume(BlockDriverState *bs, const char *tag) { - while (bs && bs->drv && !bs->drv->bdrv_debug_resume) { + while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) { bs = bs->file; } @@ -3374,7 +4279,7 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, { trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, cb, opaque, false); } @@ -3384,7 +4289,18 @@ BlockDriverAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, { trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, true); +} + +BlockDriverAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, + BlockDriverCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, + BDRV_REQ_ZERO_WRITE | flags, cb, opaque, true); } @@ -3556,8 +4472,10 @@ int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) /* Run the aio requests. */ mcb->num_requests = num_reqs; for (i = 0; i < num_reqs; i++) { - bdrv_aio_writev(bs, reqs[i].sector, reqs[i].qiov, - reqs[i].nb_sectors, multiwrite_cb, mcb); + bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, + reqs[i].nb_sectors, reqs[i].flags, + multiwrite_cb, mcb, + true); } return 0; @@ -3568,169 +4486,6 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb) acb->aiocb_info->cancel(acb); } -/* block I/O throttling */ -static bool bdrv_exceed_bps_limits(BlockDriverState *bs, int nb_sectors, - bool is_write, double elapsed_time, uint64_t *wait) -{ - uint64_t bps_limit = 0; - uint64_t extension; - double bytes_limit, bytes_base, bytes_res; - double slice_time, wait_time; - - if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { - bps_limit = bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; - } else if (bs->io_limits.bps[is_write]) { - bps_limit = bs->io_limits.bps[is_write]; - } else { - if (wait) { - *wait = 0; - } - - return false; - } - - slice_time = bs->slice_end - bs->slice_start; - slice_time /= (NANOSECONDS_PER_SECOND); - bytes_limit = bps_limit * slice_time; - bytes_base = bs->slice_submitted.bytes[is_write]; - if (bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]) { - bytes_base += bs->slice_submitted.bytes[!is_write]; - } - - /* bytes_base: the bytes of data which have been read/written; and - * it is obtained from the history statistic info. - * bytes_res: the remaining bytes of data which need to be read/written. - * (bytes_base + bytes_res) / bps_limit: used to calcuate - * the total time for completing reading/writting all data. - */ - bytes_res = (unsigned) nb_sectors * BDRV_SECTOR_SIZE; - - if (bytes_base + bytes_res <= bytes_limit) { - if (wait) { - *wait = 0; - } - - return false; - } - - /* Calc approx time to dispatch */ - wait_time = (bytes_base + bytes_res) / bps_limit - elapsed_time; - - /* When the I/O rate at runtime exceeds the limits, - * bs->slice_end need to be extended in order that the current statistic - * info can be kept until the timer fire, so it is increased and tuned - * based on the result of experiment. - */ - extension = wait_time * NANOSECONDS_PER_SECOND; - extension = DIV_ROUND_UP(extension, BLOCK_IO_SLICE_TIME) * - BLOCK_IO_SLICE_TIME; - bs->slice_end += extension; - if (wait) { - *wait = wait_time * NANOSECONDS_PER_SECOND; - } - - return true; -} - -static bool bdrv_exceed_iops_limits(BlockDriverState *bs, bool is_write, - double elapsed_time, uint64_t *wait) -{ - uint64_t iops_limit = 0; - double ios_limit, ios_base; - double slice_time, wait_time; - - if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { - iops_limit = bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; - } else if (bs->io_limits.iops[is_write]) { - iops_limit = bs->io_limits.iops[is_write]; - } else { - if (wait) { - *wait = 0; - } - - return false; - } - - slice_time = bs->slice_end - bs->slice_start; - slice_time /= (NANOSECONDS_PER_SECOND); - ios_limit = iops_limit * slice_time; - ios_base = bs->slice_submitted.ios[is_write]; - if (bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]) { - ios_base += bs->slice_submitted.ios[!is_write]; - } - - if (ios_base + 1 <= ios_limit) { - if (wait) { - *wait = 0; - } - - return false; - } - - /* Calc approx time to dispatch, in seconds */ - wait_time = (ios_base + 1) / iops_limit; - if (wait_time > elapsed_time) { - wait_time = wait_time - elapsed_time; - } else { - wait_time = 0; - } - - /* Exceeded current slice, extend it by another slice time */ - bs->slice_end += BLOCK_IO_SLICE_TIME; - if (wait) { - *wait = wait_time * NANOSECONDS_PER_SECOND; - } - - return true; -} - -static bool bdrv_exceed_io_limits(BlockDriverState *bs, int nb_sectors, - bool is_write, int64_t *wait) -{ - int64_t now, max_wait; - uint64_t bps_wait = 0, iops_wait = 0; - double elapsed_time; - int bps_ret, iops_ret; - - now = qemu_get_clock_ns(vm_clock); - if (now > bs->slice_end) { - bs->slice_start = now; - bs->slice_end = now + BLOCK_IO_SLICE_TIME; - memset(&bs->slice_submitted, 0, sizeof(bs->slice_submitted)); - } - - elapsed_time = now - bs->slice_start; - elapsed_time /= (NANOSECONDS_PER_SECOND); - - bps_ret = bdrv_exceed_bps_limits(bs, nb_sectors, - is_write, elapsed_time, &bps_wait); - iops_ret = bdrv_exceed_iops_limits(bs, is_write, - elapsed_time, &iops_wait); - if (bps_ret || iops_ret) { - max_wait = bps_wait > iops_wait ? bps_wait : iops_wait; - if (wait) { - *wait = max_wait; - } - - now = qemu_get_clock_ns(vm_clock); - if (bs->slice_end < now + max_wait) { - bs->slice_end = now + max_wait; - } - - return true; - } - - if (wait) { - *wait = 0; - } - - bs->slice_submitted.bytes[is_write] += (int64_t)nb_sectors * - BDRV_SECTOR_SIZE; - bs->slice_submitted.ios[is_write]++; - - return false; -} - /**************************************************************/ /* async block device emulation */ @@ -3862,10 +4617,10 @@ static void coroutine_fn bdrv_co_do_rw(void *opaque) if (!acb->is_write) { acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, 0); + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); } else { acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, 0); + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); } acb->bh = qemu_bh_new(bdrv_co_em_bh, acb); @@ -3876,6 +4631,7 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BdrvRequestFlags flags, BlockDriverCompletionFunc *cb, void *opaque, bool is_write) @@ -3887,6 +4643,7 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, acb->req.sector = sector_num; acb->req.nb_sectors = nb_sectors; acb->req.qiov = qiov; + acb->req.flags = flags; acb->is_write = is_write; acb->done = NULL; @@ -4108,19 +4865,43 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) return bdrv_co_flush(bs->file); } -void bdrv_invalidate_cache(BlockDriverState *bs) +void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp) { - if (bs->drv && bs->drv->bdrv_invalidate_cache) { - bs->drv->bdrv_invalidate_cache(bs); + Error *local_err = NULL; + int ret; + + if (!bs->drv) { + return; + } + + if (bs->drv->bdrv_invalidate_cache) { + bs->drv->bdrv_invalidate_cache(bs, &local_err); + } else if (bs->file) { + bdrv_invalidate_cache(bs->file, &local_err); + } + if (local_err) { + error_propagate(errp, local_err); + return; + } + + ret = refresh_total_sectors(bs, bs->total_sectors); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not refresh total sector count"); + return; } } -void bdrv_invalidate_cache_all(void) +void bdrv_invalidate_cache_all(Error **errp) { BlockDriverState *bs; + Error *local_err = NULL; - QTAILQ_FOREACH(bs, &bdrv_states, list) { - bdrv_invalidate_cache(bs); + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { + bdrv_invalidate_cache(bs, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } } } @@ -4128,7 +4909,7 @@ void bdrv_clear_incoming_migration_all(void) { BlockDriverState *bs; - QTAILQ_FOREACH(bs, &bdrv_states, list) { + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { bs->open_flags = bs->open_flags & ~(BDRV_O_INCOMING); } } @@ -4155,16 +4936,29 @@ int bdrv_flush(BlockDriverState *bs) return rwco.ret; } +typedef struct DiscardCo { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + int ret; +} DiscardCo; static void coroutine_fn bdrv_discard_co_entry(void *opaque) { - RwCo *rwco = opaque; + DiscardCo *rwco = opaque; rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); } +/* if no limit is specified in the BlockLimits use a default + * of 32768 512-byte sectors (16 MiB) per request. + */ +#define MAX_DISCARD_DEFAULT 32768 + int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { + int max_discard; + if (!bs->drv) { return -ENOMEDIUM; } else if (bdrv_check_request(bs, sector_num, nb_sectors)) { @@ -4173,40 +4967,68 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, return -EROFS; } - if (bs->dirty_bitmap) { - bdrv_reset_dirty(bs, sector_num, nb_sectors); - } + bdrv_reset_dirty(bs, sector_num, nb_sectors); /* Do nothing if disabled. */ if (!(bs->open_flags & BDRV_O_UNMAP)) { return 0; } - if (bs->drv->bdrv_co_discard) { - return bs->drv->bdrv_co_discard(bs, sector_num, nb_sectors); - } else if (bs->drv->bdrv_aio_discard) { - BlockDriverAIOCB *acb; - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; + if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { + return 0; + } - acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, - bdrv_co_io_em_complete, &co); - if (acb == NULL) { - return -EIO; + max_discard = bs->bl.max_discard ? bs->bl.max_discard : MAX_DISCARD_DEFAULT; + while (nb_sectors > 0) { + int ret; + int num = nb_sectors; + + /* align request */ + if (bs->bl.discard_alignment && + num >= bs->bl.discard_alignment && + sector_num % bs->bl.discard_alignment) { + if (num > bs->bl.discard_alignment) { + num = bs->bl.discard_alignment; + } + num -= sector_num % bs->bl.discard_alignment; + } + + /* limit request size */ + if (num > max_discard) { + num = max_discard; + } + + if (bs->drv->bdrv_co_discard) { + ret = bs->drv->bdrv_co_discard(bs, sector_num, num); } else { - qemu_coroutine_yield(); - return co.ret; + BlockDriverAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + return -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } } - } else { - return 0; + if (ret && ret != -ENOTSUP) { + return ret; + } + + sector_num += num; + nb_sectors -= num; } + return 0; } int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { Coroutine *co; - RwCo rwco = { + DiscardCo rwco = { .bs = bs, .sector_num = sector_num, .nb_sectors = nb_sectors, @@ -4311,14 +5133,14 @@ BlockDriverAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, return NULL; } -void bdrv_set_buffer_alignment(BlockDriverState *bs, int align) +void bdrv_set_guest_block_size(BlockDriverState *bs, int align) { - bs->buffer_alignment = align; + bs->guest_block_size = align; } void *qemu_blockalign(BlockDriverState *bs, size_t size) { - return qemu_memalign((bs && bs->buffer_alignment) ? bs->buffer_alignment : 512, size); + return qemu_memalign(bdrv_opt_mem_align(bs), size); } /* @@ -4327,9 +5149,13 @@ void *qemu_blockalign(BlockDriverState *bs, size_t size) bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) { int i; + size_t alignment = bdrv_opt_mem_align(bs); for (i = 0; i < qiov->niov; i++) { - if ((uintptr_t) qiov->iov[i].iov_base % bs->buffer_alignment) { + if ((uintptr_t) qiov->iov[i].iov_base % alignment) { + return false; + } + if (qiov->iov[i].iov_len % alignment) { return false; } } @@ -4337,57 +5163,113 @@ bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) return true; } -void bdrv_set_dirty_tracking(BlockDriverState *bs, int granularity) +BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, int granularity, + Error **errp) { int64_t bitmap_size; + BdrvDirtyBitmap *bitmap; assert((granularity & (granularity - 1)) == 0); - if (granularity) { - granularity >>= BDRV_SECTOR_BITS; - assert(!bs->dirty_bitmap); - bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS); - bs->dirty_bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1); - } else { - if (bs->dirty_bitmap) { - hbitmap_free(bs->dirty_bitmap); - bs->dirty_bitmap = NULL; + granularity >>= BDRV_SECTOR_BITS; + assert(granularity); + bitmap_size = bdrv_getlength(bs); + if (bitmap_size < 0) { + error_setg_errno(errp, -bitmap_size, "could not get length of device"); + errno = -bitmap_size; + return NULL; + } + bitmap_size >>= BDRV_SECTOR_BITS; + bitmap = g_malloc0(sizeof(BdrvDirtyBitmap)); + bitmap->bitmap = hbitmap_alloc(bitmap_size, ffs(granularity) - 1); + QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); + return bitmap; +} + +void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) +{ + BdrvDirtyBitmap *bm, *next; + QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { + if (bm == bitmap) { + QLIST_REMOVE(bitmap, list); + hbitmap_free(bitmap->bitmap); + g_free(bitmap); + return; } } } -int bdrv_get_dirty(BlockDriverState *bs, int64_t sector) +BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) +{ + BdrvDirtyBitmap *bm; + BlockDirtyInfoList *list = NULL; + BlockDirtyInfoList **plist = &list; + + QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { + BlockDirtyInfo *info = g_malloc0(sizeof(BlockDirtyInfo)); + BlockDirtyInfoList *entry = g_malloc0(sizeof(BlockDirtyInfoList)); + info->count = bdrv_get_dirty_count(bs, bm); + info->granularity = + ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bm->bitmap)); + entry->value = info; + *plist = entry; + plist = &entry->next; + } + + return list; +} + +int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, int64_t sector) { - if (bs->dirty_bitmap) { - return hbitmap_get(bs->dirty_bitmap, sector); + if (bitmap) { + return hbitmap_get(bitmap->bitmap, sector); } else { return 0; } } -void bdrv_dirty_iter_init(BlockDriverState *bs, HBitmapIter *hbi) +void bdrv_dirty_iter_init(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) { - hbitmap_iter_init(hbi, bs->dirty_bitmap, 0); + hbitmap_iter_init(hbi, bitmap->bitmap, 0); } void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors) { - hbitmap_set(bs->dirty_bitmap, cur_sector, nr_sectors); + BdrvDirtyBitmap *bitmap; + QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); + } } -void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors) +void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors) { - hbitmap_reset(bs->dirty_bitmap, cur_sector, nr_sectors); + BdrvDirtyBitmap *bitmap; + QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); + } } -int64_t bdrv_get_dirty_count(BlockDriverState *bs) +int64_t bdrv_get_dirty_count(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) { - if (bs->dirty_bitmap) { - return hbitmap_count(bs->dirty_bitmap); - } else { - return 0; + return hbitmap_count(bitmap->bitmap); +} + +/* Get a reference to bs */ +void bdrv_ref(BlockDriverState *bs) +{ + bs->refcnt++; +} + +/* Release a previously grabbed reference to bs. + * If after releasing, reference count is zero, the BlockDriverState is + * deleted. */ +void bdrv_unref(BlockDriverState *bs) +{ + assert(bs->refcnt > 0); + if (--bs->refcnt == 0) { + bdrv_delete(bs); } } @@ -4470,9 +5352,9 @@ void bdrv_img_create(const char *filename, const char *fmt, { QEMUOptionParameter *param = NULL, *create_options = NULL; QEMUOptionParameter *backing_fmt, *backing_file, *size; - BlockDriverState *bs = NULL; BlockDriver *drv, *proto_drv; BlockDriver *backing_drv = NULL; + Error *local_err = NULL; int ret = 0; /* Find driver and parse its options */ @@ -4548,6 +5430,7 @@ void bdrv_img_create(const char *filename, const char *fmt, size = get_option_parameter(param, BLOCK_OPT_SIZE); if (size && size->value.n == -1) { if (backing_file && backing_file->value.s) { + BlockDriverState *bs; uint64_t size; char buf[32]; int back_flags; @@ -4556,13 +5439,15 @@ void bdrv_img_create(const char *filename, const char *fmt, back_flags = flags & ~(BDRV_O_RDWR | BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING); - bs = bdrv_new(""); - - ret = bdrv_open(bs, backing_file->value.s, NULL, back_flags, - backing_drv); + bs = NULL; + ret = bdrv_open(&bs, backing_file->value.s, NULL, NULL, back_flags, + backing_drv, &local_err); if (ret < 0) { - error_setg_errno(errp, -ret, "Could not open '%s'", - backing_file->value.s); + error_setg_errno(errp, -ret, "Could not open '%s': %s", + backing_file->value.s, + error_get_pretty(local_err)); + error_free(local_err); + local_err = NULL; goto out; } bdrv_get_geometry(bs, &size); @@ -4570,6 +5455,8 @@ void bdrv_img_create(const char *filename, const char *fmt, snprintf(buf, sizeof(buf), "%" PRId64, size); set_option_parameter(param, BLOCK_OPT_SIZE, buf); + + bdrv_unref(bs); } else { error_setg(errp, "Image creation needs a size parameter"); goto out; @@ -4581,30 +5468,27 @@ void bdrv_img_create(const char *filename, const char *fmt, print_option_parameters(param); puts(""); } - ret = bdrv_create(drv, filename, param); - if (ret < 0) { - if (ret == -ENOTSUP) { - error_setg(errp,"Formatting or formatting option not supported for " - "file format '%s'", fmt); - } else if (ret == -EFBIG) { - const char *cluster_size_hint = ""; - if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) { - cluster_size_hint = " (try using a larger cluster size)"; - } - error_setg(errp, "The image size is too large for file format '%s'%s", - fmt, cluster_size_hint); - } else { - error_setg(errp, "%s: error while creating %s: %s", filename, fmt, - strerror(-ret)); + ret = bdrv_create(drv, filename, param, &local_err); + if (ret == -EFBIG) { + /* This is generally a better message than whatever the driver would + * deliver (especially because of the cluster_size_hint), since that + * is most probably not much different from "image too large". */ + const char *cluster_size_hint = ""; + if (get_option_parameter(create_options, BLOCK_OPT_CLUSTER_SIZE)) { + cluster_size_hint = " (try using a larger cluster size)"; } + error_setg(errp, "The image size is too large for file format '%s'" + "%s", fmt, cluster_size_hint); + error_free(local_err); + local_err = NULL; } out: free_option_parameters(create_options); free_option_parameters(param); - if (bs) { - bdrv_delete(bs); + if (local_err) { + error_propagate(errp, local_err); } } @@ -4619,3 +5503,68 @@ void bdrv_add_before_write_notifier(BlockDriverState *bs, { notifier_with_return_list_add(&bs->before_write_notifiers, notifier); } + +int bdrv_amend_options(BlockDriverState *bs, QEMUOptionParameter *options) +{ + if (bs->drv->bdrv_amend_options == NULL) { + return -ENOTSUP; + } + return bs->drv->bdrv_amend_options(bs, options); +} + +/* This function will be called by the bdrv_recurse_is_first_non_filter method + * of block filter and by bdrv_is_first_non_filter. + * It is used to test if the given bs is the candidate or recurse more in the + * node graph. + */ +bool bdrv_recurse_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) +{ + /* return false if basic checks fails */ + if (!bs || !bs->drv) { + return false; + } + + /* the code reached a non block filter driver -> check if the bs is + * the same as the candidate. It's the recursion termination condition. + */ + if (!bs->drv->is_filter) { + return bs == candidate; + } + /* Down this path the driver is a block filter driver */ + + /* If the block filter recursion method is defined use it to recurse down + * the node graph. + */ + if (bs->drv->bdrv_recurse_is_first_non_filter) { + return bs->drv->bdrv_recurse_is_first_non_filter(bs, candidate); + } + + /* the driver is a block filter but don't allow to recurse -> return false + */ + return false; +} + +/* This function checks if the candidate is the first non filter bs down it's + * bs chain. Since we don't have pointers to parents it explore all bs chains + * from the top. Some filters can choose not to pass down the recursion. + */ +bool bdrv_is_first_non_filter(BlockDriverState *candidate) +{ + BlockDriverState *bs; + + /* walk down the bs forest recursively */ + QTAILQ_FOREACH(bs, &bdrv_states, device_list) { + bool perm; + + /* try to recurse in this top level bs */ + perm = bdrv_recurse_is_first_non_filter(bs, candidate); + + /* candidate is the first non filter */ + if (perm) { + return true; + } + } + + return false; +} diff --git a/block/Makefile.objs b/block/Makefile.objs index 4cf9aa499f..fd88c03ece 100644 --- a/block/Makefile.objs +++ b/block/Makefile.objs @@ -1,8 +1,9 @@ -block-obj-y += raw.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o +block-obj-y += raw_bsd.o cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o -block-obj-y += vhdx.o +block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o +block-obj-$(CONFIG_QUORUM) += quorum.o block-obj-y += parallels.o blkdebug.o blkverify.o block-obj-y += snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o @@ -10,8 +11,9 @@ block-obj-$(CONFIG_POSIX) += raw-posix.o block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o ifeq ($(CONFIG_POSIX),y) -block-obj-y += nbd.o sheepdog.o +block-obj-y += nbd.o nbd-client.o sheepdog.o block-obj-$(CONFIG_LIBISCSI) += iscsi.o +block-obj-$(CONFIG_LIBNFS) += nfs.o block-obj-$(CONFIG_CURL) += curl.o block-obj-$(CONFIG_RBD) += rbd.o block-obj-$(CONFIG_GLUSTERFS) += gluster.o @@ -23,4 +25,15 @@ common-obj-y += commit.o common-obj-y += mirror.o common-obj-y += backup.o -$(obj)/curl.o: QEMU_CFLAGS+=$(CURL_CFLAGS) +iscsi.o-cflags := $(LIBISCSI_CFLAGS) +iscsi.o-libs := $(LIBISCSI_LIBS) +curl.o-cflags := $(CURL_CFLAGS) +curl.o-libs := $(CURL_LIBS) +rbd.o-cflags := $(RBD_CFLAGS) +rbd.o-libs := $(RBD_LIBS) +gluster.o-cflags := $(GLUSTERFS_CFLAGS) +gluster.o-libs := $(GLUSTERFS_LIBS) +ssh.o-cflags := $(LIBSSH2_CFLAGS) +ssh.o-libs := $(LIBSSH2_LIBS) +qcow.o-libs := -lz +linux-aio.o-libs := -laio diff --git a/block/backup.c b/block/backup.c index 6ae8a05a3e..15a2e55e8e 100644 --- a/block/backup.c +++ b/block/backup.c @@ -138,7 +138,8 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, if (buffer_is_zero(iov.iov_base, iov.iov_len)) { ret = bdrv_co_write_zeroes(job->target, - start * BACKUP_SECTORS_PER_CLUSTER, n); + start * BACKUP_SECTORS_PER_CLUSTER, + n, BDRV_REQ_MAY_UNMAP); } else { ret = bdrv_co_writev(job->target, start * BACKUP_SECTORS_PER_CLUSTER, n, @@ -180,8 +181,13 @@ static int coroutine_fn backup_before_write_notify( void *opaque) { BdrvTrackedRequest *req = opaque; + int64_t sector_num = req->offset >> BDRV_SECTOR_BITS; + int nb_sectors = req->bytes >> BDRV_SECTOR_BITS; - return backup_do_cow(req->bs, req->sector_num, req->nb_sectors, NULL); + assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + return backup_do_cow(req->bs, sector_num, nb_sectors, NULL); } static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp) @@ -202,9 +208,9 @@ static void backup_iostatus_reset(BlockJob *job) bdrv_iostatus_reset(s->target); } -static const BlockJobType backup_job_type = { +static const BlockJobDriver backup_job_driver = { .instance_size = sizeof(BackupBlockJob), - .job_type = "backup", + .job_type = BLOCK_JOB_TYPE_BACKUP, .set_speed = backup_set_speed, .iostatus_reset = backup_iostatus_reset, }; @@ -272,9 +278,9 @@ static void coroutine_fn backup_run(void *opaque) uint64_t delay_ns = ratelimit_calculate_delay( &job->limit, job->sectors_read); job->sectors_read = 0; - block_job_sleep_ns(&job->common, rt_clock, delay_ns); + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns); } else { - block_job_sleep_ns(&job->common, rt_clock, 0); + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0); } if (block_job_is_cancelled(&job->common)) { @@ -289,14 +295,14 @@ static void coroutine_fn backup_run(void *opaque) * backing file. */ for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) { - /* bdrv_co_is_allocated() only returns true/false based - * on the first set of sectors it comes accross that + /* bdrv_is_allocated() only returns true/false based + * on the first set of sectors it comes across that * are are all in the same state. * For that reason we must verify each sector in the * backup cluster length. We end up copying more than * needed but at some point that is always the case. */ alloced = - bdrv_co_is_allocated(bs, + bdrv_is_allocated(bs, start * BACKUP_SECTORS_PER_CLUSTER + i, BACKUP_SECTORS_PER_CLUSTER - i, &n); i += n; @@ -338,7 +344,7 @@ static void coroutine_fn backup_run(void *opaque) hbitmap_free(job->bitmap); bdrv_iostatus_disable(target); - bdrv_delete(target); + bdrv_unref(target); block_job_completed(&job->common, ret); } @@ -370,7 +376,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, return; } - BackupBlockJob *job = block_job_create(&backup_job_type, bs, speed, + BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed, cb, opaque, errp); if (!job) { return; diff --git a/block/blkdebug.c b/block/blkdebug.c index ccb627ad93..380c736101 100644 --- a/block/blkdebug.c +++ b/block/blkdebug.c @@ -168,6 +168,7 @@ static const char *event_names[BLKDBG_EVENT_MAX] = { [BLKDBG_REFTABLE_LOAD] = "reftable_load", [BLKDBG_REFTABLE_GROW] = "reftable_grow", + [BLKDBG_REFTABLE_UPDATE] = "reftable_update", [BLKDBG_REFBLOCK_LOAD] = "refblock_load", [BLKDBG_REFBLOCK_UPDATE] = "refblock_update", @@ -185,6 +186,14 @@ static const char *event_names[BLKDBG_EVENT_MAX] = { [BLKDBG_FLUSH_TO_OS] = "flush_to_os", [BLKDBG_FLUSH_TO_DISK] = "flush_to_disk", + + [BLKDBG_PWRITEV_RMW_HEAD] = "pwritev_rmw.head", + [BLKDBG_PWRITEV_RMW_AFTER_HEAD] = "pwritev_rmw.after_head", + [BLKDBG_PWRITEV_RMW_TAIL] = "pwritev_rmw.tail", + [BLKDBG_PWRITEV_RMW_AFTER_TAIL] = "pwritev_rmw.after_tail", + [BLKDBG_PWRITEV] = "pwritev", + [BLKDBG_PWRITEV_ZERO] = "pwritev_zero", + [BLKDBG_PWRITEV_DONE] = "pwritev_done", }; static int get_event_by_name(const char *name, BlkDebugEvent *event) @@ -270,19 +279,33 @@ static void remove_rule(BlkdebugRule *rule) g_free(rule); } -static int read_config(BDRVBlkdebugState *s, const char *filename) +static int read_config(BDRVBlkdebugState *s, const char *filename, + QDict *options, Error **errp) { - FILE *f; + FILE *f = NULL; int ret; struct add_rule_data d; + Error *local_err = NULL; - f = fopen(filename, "r"); - if (f == NULL) { - return -errno; + if (filename) { + f = fopen(filename, "r"); + if (f == NULL) { + error_setg_errno(errp, errno, "Could not read blkdebug config file"); + return -errno; + } + + ret = qemu_config_parse(f, config_groups, filename); + if (ret < 0) { + error_setg(errp, "Could not parse blkdebug config file"); + ret = -EINVAL; + goto fail; + } } - ret = qemu_config_parse(f, config_groups, filename); - if (ret < 0) { + qemu_config_parse_qdict(options, config_groups, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; goto fail; } @@ -297,7 +320,9 @@ static int read_config(BDRVBlkdebugState *s, const char *filename) fail: qemu_opts_reset(&inject_error_opts); qemu_opts_reset(&set_state_opts); - fclose(f); + if (f) { + fclose(f); + } return ret; } @@ -309,7 +334,9 @@ static void blkdebug_parse_filename(const char *filename, QDict *options, /* Parse the blkdebug: prefix */ if (!strstart(filename, "blkdebug:", &filename)) { - error_setg(errp, "File name string must start with 'blkdebug:'"); + /* There was no prefix; therefore, all options have to be already + present in the QDict (except for the filename) */ + qdict_put(options, "x-image", qstring_from_str(filename)); return; } @@ -345,53 +372,68 @@ static QemuOptsList runtime_opts = { .type = QEMU_OPT_STRING, .help = "[internal use only, will be removed]", }, + { + .name = "align", + .type = QEMU_OPT_SIZE, + .help = "Required alignment in bytes", + }, { /* end of list */ } }, }; -static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags) +static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVBlkdebugState *s = bs->opaque; QemuOpts *opts; Error *local_err = NULL; - const char *filename, *config; + const char *config; + uint64_t align; int ret; - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; - goto fail; + goto out; } - /* Read rules from config file */ + /* Read rules from config file or command line options */ config = qemu_opt_get(opts, "config"); - if (config) { - ret = read_config(s, config); - if (ret < 0) { - goto fail; - } + ret = read_config(s, config, options, errp); + if (ret) { + goto out; } /* Set initial state */ s->state = 1; /* Open the backing file */ - filename = qemu_opt_get(opts, "x-image"); - if (filename == NULL) { - ret = -EINVAL; - goto fail; + assert(bs->file == NULL); + ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-image"), options, "image", + flags | BDRV_O_PROTOCOL, false, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto out; } - ret = bdrv_file_open(&bs->file, filename, NULL, flags); - if (ret < 0) { - goto fail; + /* Set request alignment */ + align = qemu_opt_get_size(opts, "align", bs->request_alignment); + if (align > 0 && align < INT_MAX && !(align & (align - 1))) { + bs->request_alignment = align; + } else { + error_setg(errp, "Invalid alignment"); + ret = -EINVAL; + goto fail_unref; } ret = 0; -fail: + goto out; + +fail_unref: + bdrv_unref(bs->file); +out: qemu_opts_del(opts); return ret; } @@ -590,9 +632,9 @@ static int blkdebug_debug_breakpoint(BlockDriverState *bs, const char *event, static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag) { BDRVBlkdebugState *s = bs->opaque; - BlkdebugSuspendedReq *r; + BlkdebugSuspendedReq *r, *next; - QLIST_FOREACH(r, &s->suspended_reqs, next) { + QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, next) { if (!strcmp(r->tag, tag)) { qemu_coroutine_enter(r->co, NULL); return 0; @@ -601,6 +643,31 @@ static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag) return -ENOENT; } +static int blkdebug_debug_remove_breakpoint(BlockDriverState *bs, + const char *tag) +{ + BDRVBlkdebugState *s = bs->opaque; + BlkdebugSuspendedReq *r, *r_next; + BlkdebugRule *rule, *next; + int i, ret = -ENOENT; + + for (i = 0; i < BLKDBG_EVENT_MAX; i++) { + QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) { + if (rule->action == ACTION_SUSPEND && + !strcmp(rule->options.suspend.tag, tag)) { + remove_rule(rule); + ret = 0; + } + } + } + QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, r_next) { + if (!strcmp(r->tag, tag)) { + qemu_coroutine_enter(r->co, NULL); + ret = 0; + } + } + return ret; +} static bool blkdebug_debug_is_suspended(BlockDriverState *bs, const char *tag) { @@ -635,6 +702,8 @@ static BlockDriver bdrv_blkdebug = { .bdrv_debug_event = blkdebug_debug_event, .bdrv_debug_breakpoint = blkdebug_debug_breakpoint, + .bdrv_debug_remove_breakpoint + = blkdebug_debug_remove_breakpoint, .bdrv_debug_resume = blkdebug_debug_resume, .bdrv_debug_is_suspended = blkdebug_debug_is_suspended, }; diff --git a/block/blkverify.c b/block/blkverify.c index 1d58cc3932..e1c31171c3 100644 --- a/block/blkverify.c +++ b/block/blkverify.c @@ -78,7 +78,9 @@ static void blkverify_parse_filename(const char *filename, QDict *options, /* Parse the blkverify: prefix */ if (!strstart(filename, "blkverify:", &filename)) { - error_setg(errp, "File name string must start with 'blkverify:'"); + /* There was no prefix; therefore, all options have to be already + present in the QDict (except for the filename) */ + qdict_put(options, "x-image", qstring_from_str(filename)); return; } @@ -116,46 +118,37 @@ static QemuOptsList runtime_opts = { }, }; -static int blkverify_open(BlockDriverState *bs, QDict *options, int flags) +static int blkverify_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVBlkverifyState *s = bs->opaque; QemuOpts *opts; Error *local_err = NULL; - const char *filename, *raw; int ret; - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } - /* Parse the raw image filename */ - raw = qemu_opt_get(opts, "x-raw"); - if (raw == NULL) { - ret = -EINVAL; - goto fail; - } - - ret = bdrv_file_open(&bs->file, raw, NULL, flags); + /* Open the raw file */ + assert(bs->file == NULL); + ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-raw"), options, + "raw", flags | BDRV_O_PROTOCOL, false, &local_err); if (ret < 0) { + error_propagate(errp, local_err); goto fail; } /* Open the test file */ - filename = qemu_opt_get(opts, "x-image"); - if (filename == NULL) { - ret = -EINVAL; - goto fail; - } - - s->test_file = bdrv_new(""); - ret = bdrv_open(s->test_file, filename, NULL, flags, NULL); + assert(s->test_file == NULL); + ret = bdrv_open_image(&s->test_file, qemu_opt_get(opts, "x-image"), options, + "test", flags, false, &local_err); if (ret < 0) { - bdrv_delete(s->test_file); + error_propagate(errp, local_err); s->test_file = NULL; goto fail; } @@ -169,7 +162,7 @@ static void blkverify_close(BlockDriverState *bs) { BDRVBlkverifyState *s = bs->opaque; - bdrv_delete(s->test_file); + bdrv_unref(s->test_file); s->test_file = NULL; } @@ -180,110 +173,6 @@ static int64_t blkverify_getlength(BlockDriverState *bs) return bdrv_getlength(s->test_file); } -/** - * Check that I/O vector contents are identical - * - * @a: I/O vector - * @b: I/O vector - * @ret: Offset to first mismatching byte or -1 if match - */ -static ssize_t blkverify_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) -{ - int i; - ssize_t offset = 0; - - assert(a->niov == b->niov); - for (i = 0; i < a->niov; i++) { - size_t len = 0; - uint8_t *p = (uint8_t *)a->iov[i].iov_base; - uint8_t *q = (uint8_t *)b->iov[i].iov_base; - - assert(a->iov[i].iov_len == b->iov[i].iov_len); - while (len < a->iov[i].iov_len && *p++ == *q++) { - len++; - } - - offset += len; - - if (len != a->iov[i].iov_len) { - return offset; - } - } - return -1; -} - -typedef struct { - int src_index; - struct iovec *src_iov; - void *dest_base; -} IOVectorSortElem; - -static int sortelem_cmp_src_base(const void *a, const void *b) -{ - const IOVectorSortElem *elem_a = a; - const IOVectorSortElem *elem_b = b; - - /* Don't overflow */ - if (elem_a->src_iov->iov_base < elem_b->src_iov->iov_base) { - return -1; - } else if (elem_a->src_iov->iov_base > elem_b->src_iov->iov_base) { - return 1; - } else { - return 0; - } -} - -static int sortelem_cmp_src_index(const void *a, const void *b) -{ - const IOVectorSortElem *elem_a = a; - const IOVectorSortElem *elem_b = b; - - return elem_a->src_index - elem_b->src_index; -} - -/** - * Copy contents of I/O vector - * - * The relative relationships of overlapping iovecs are preserved. This is - * necessary to ensure identical semantics in the cloned I/O vector. - */ -static void blkverify_iovec_clone(QEMUIOVector *dest, const QEMUIOVector *src, - void *buf) -{ - IOVectorSortElem sortelems[src->niov]; - void *last_end; - int i; - - /* Sort by source iovecs by base address */ - for (i = 0; i < src->niov; i++) { - sortelems[i].src_index = i; - sortelems[i].src_iov = &src->iov[i]; - } - qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_base); - - /* Allocate buffer space taking into account overlapping iovecs */ - last_end = NULL; - for (i = 0; i < src->niov; i++) { - struct iovec *cur = sortelems[i].src_iov; - ptrdiff_t rewind = 0; - - /* Detect overlap */ - if (last_end && last_end > cur->iov_base) { - rewind = last_end - cur->iov_base; - } - - sortelems[i].dest_base = buf - rewind; - buf += cur->iov_len - MIN(rewind, cur->iov_len); - last_end = MAX(cur->iov_base + cur->iov_len, last_end); - } - - /* Sort by source iovec index and build destination iovec */ - qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_index); - for (i = 0; i < src->niov; i++) { - qemu_iovec_add(dest, sortelems[i].dest_base, src->iov[i].iov_len); - } -} - static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, @@ -347,7 +236,7 @@ static void blkverify_aio_cb(void *opaque, int ret) static void blkverify_verify_readv(BlkverifyAIOCB *acb) { - ssize_t offset = blkverify_iovec_compare(acb->qiov, &acb->raw_qiov); + ssize_t offset = qemu_iovec_compare(acb->qiov, &acb->raw_qiov); if (offset != -1) { blkverify_err(acb, "contents mismatch in sector %" PRId64, acb->sector_num + (int64_t)(offset / BDRV_SECTOR_SIZE)); @@ -365,7 +254,7 @@ static BlockDriverAIOCB *blkverify_aio_readv(BlockDriverState *bs, acb->verify = blkverify_verify_readv; acb->buf = qemu_blockalign(bs->file, qiov->size); qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov); - blkverify_iovec_clone(&acb->raw_qiov, qiov, acb->buf); + qemu_iovec_clone(&acb->raw_qiov, qiov, acb->buf); bdrv_aio_readv(s->test_file, sector_num, qiov, nb_sectors, blkverify_aio_cb, acb); @@ -399,6 +288,20 @@ static BlockDriverAIOCB *blkverify_aio_flush(BlockDriverState *bs, return bdrv_aio_flush(s->test_file, cb, opaque); } +static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) +{ + BDRVBlkverifyState *s = bs->opaque; + + bool perm = bdrv_recurse_is_first_non_filter(bs->file, candidate); + + if (perm) { + return true; + } + + return bdrv_recurse_is_first_non_filter(s->test_file, candidate); +} + static BlockDriver bdrv_blkverify = { .format_name = "blkverify", .protocol_name = "blkverify", @@ -412,6 +315,9 @@ static BlockDriver bdrv_blkverify = { .bdrv_aio_readv = blkverify_aio_readv, .bdrv_aio_writev = blkverify_aio_writev, .bdrv_aio_flush = blkverify_aio_flush, + + .is_filter = true, + .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter, }; static void bdrv_blkverify_init(void) diff --git a/block/bochs.c b/block/bochs.c index d7078c0775..eba23df335 100644 --- a/block/bochs.c +++ b/block/bochs.c @@ -38,57 +38,42 @@ // not allocated: 0xffffffff -// always little-endian -struct bochs_header_v1 { - char magic[32]; // "Bochs Virtual HD Image" - char type[16]; // "Redolog" - char subtype[16]; // "Undoable" / "Volatile" / "Growing" - uint32_t version; - uint32_t header; // size of header - - union { - struct { - uint32_t catalog; // num of entries - uint32_t bitmap; // bitmap size - uint32_t extent; // extent size - uint64_t disk; // disk size - char padding[HEADER_SIZE - 64 - 8 - 20]; - } redolog; - char padding[HEADER_SIZE - 64 - 8]; - } extra; -}; - // always little-endian struct bochs_header { - char magic[32]; // "Bochs Virtual HD Image" - char type[16]; // "Redolog" - char subtype[16]; // "Undoable" / "Volatile" / "Growing" + char magic[32]; /* "Bochs Virtual HD Image" */ + char type[16]; /* "Redolog" */ + char subtype[16]; /* "Undoable" / "Volatile" / "Growing" */ uint32_t version; - uint32_t header; // size of header + uint32_t header; /* size of header */ + + uint32_t catalog; /* num of entries */ + uint32_t bitmap; /* bitmap size */ + uint32_t extent; /* extent size */ union { - struct { - uint32_t catalog; // num of entries - uint32_t bitmap; // bitmap size - uint32_t extent; // extent size - uint32_t reserved; // for ??? - uint64_t disk; // disk size - char padding[HEADER_SIZE - 64 - 8 - 24]; - } redolog; - char padding[HEADER_SIZE - 64 - 8]; + struct { + uint32_t reserved; /* for ??? */ + uint64_t disk; /* disk size */ + char padding[HEADER_SIZE - 64 - 20 - 12]; + } QEMU_PACKED redolog; + struct { + uint64_t disk; /* disk size */ + char padding[HEADER_SIZE - 64 - 20 - 8]; + } QEMU_PACKED redolog_v1; + char padding[HEADER_SIZE - 64 - 20]; } extra; -}; +} QEMU_PACKED; typedef struct BDRVBochsState { CoMutex lock; uint32_t *catalog_bitmap; - int catalog_size; + uint32_t catalog_size; - int data_offset; + uint32_t data_offset; - int bitmap_blocks; - int extent_blocks; - int extent_size; + uint32_t bitmap_blocks; + uint32_t extent_blocks; + uint32_t extent_size; } BDRVBochsState; static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename) @@ -108,12 +93,12 @@ static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int bochs_open(BlockDriverState *bs, QDict *options, int flags) +static int bochs_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVBochsState *s = bs->opaque; - int i; + uint32_t i; struct bochs_header bochs; - struct bochs_header_v1 header_v1; int ret; bs->read_only = 1; // no write support yet @@ -128,17 +113,24 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags) strcmp(bochs.subtype, GROWING_TYPE) || ((le32_to_cpu(bochs.version) != HEADER_VERSION) && (le32_to_cpu(bochs.version) != HEADER_V1))) { - return -EMEDIUMTYPE; + error_setg(errp, "Image not in Bochs format"); + return -EINVAL; } if (le32_to_cpu(bochs.version) == HEADER_V1) { - memcpy(&header_v1, &bochs, sizeof(bochs)); - bs->total_sectors = le64_to_cpu(header_v1.extra.redolog.disk) / 512; + bs->total_sectors = le64_to_cpu(bochs.extra.redolog_v1.disk) / 512; } else { - bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512; + bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512; + } + + /* Limit to 1M entries to avoid unbounded allocation. This is what is + * needed for the largest image that bximage can create (~8 TB). */ + s->catalog_size = le32_to_cpu(bochs.catalog); + if (s->catalog_size > 0x100000) { + error_setg(errp, "Catalog size is too large"); + return -EFBIG; } - s->catalog_size = le32_to_cpu(bochs.extra.redolog.catalog); s->catalog_bitmap = g_malloc(s->catalog_size * 4); ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap, @@ -152,10 +144,34 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags) s->data_offset = le32_to_cpu(bochs.header) + (s->catalog_size * 4); - s->bitmap_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.bitmap) - 1) / 512; - s->extent_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.extent) - 1) / 512; + s->bitmap_blocks = 1 + (le32_to_cpu(bochs.bitmap) - 1) / 512; + s->extent_blocks = 1 + (le32_to_cpu(bochs.extent) - 1) / 512; - s->extent_size = le32_to_cpu(bochs.extra.redolog.extent); + s->extent_size = le32_to_cpu(bochs.extent); + if (s->extent_size < BDRV_SECTOR_SIZE) { + /* bximage actually never creates extents smaller than 4k */ + error_setg(errp, "Extent size must be at least 512"); + ret = -EINVAL; + goto fail; + } else if (!is_power_of_2(s->extent_size)) { + error_setg(errp, "Extent size %" PRIu32 " is not a power of two", + s->extent_size); + ret = -EINVAL; + goto fail; + } else if (s->extent_size > 0x800000) { + error_setg(errp, "Extent size %" PRIu32 " is too large", + s->extent_size); + ret = -EINVAL; + goto fail; + } + + if (s->catalog_size < DIV_ROUND_UP(bs->total_sectors, + s->extent_size / BDRV_SECTOR_SIZE)) + { + error_setg(errp, "Catalog size is too small for this disk size"); + ret = -EINVAL; + goto fail; + } qemu_co_mutex_init(&s->lock); return 0; @@ -168,29 +184,32 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags) static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) { BDRVBochsState *s = bs->opaque; - int64_t offset = sector_num * 512; - int64_t extent_index, extent_offset, bitmap_offset; + uint64_t offset = sector_num * 512; + uint64_t extent_index, extent_offset, bitmap_offset; char bitmap_entry; + int ret; // seek to sector extent_index = offset / s->extent_size; extent_offset = (offset % s->extent_size) / 512; if (s->catalog_bitmap[extent_index] == 0xffffffff) { - return -1; /* not allocated */ + return 0; /* not allocated */ } - bitmap_offset = s->data_offset + (512 * s->catalog_bitmap[extent_index] * - (s->extent_blocks + s->bitmap_blocks)); + bitmap_offset = s->data_offset + + (512 * (uint64_t) s->catalog_bitmap[extent_index] * + (s->extent_blocks + s->bitmap_blocks)); /* read in bitmap for current extent */ - if (bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8), - &bitmap_entry, 1) != 1) { - return -1; + ret = bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8), + &bitmap_entry, 1); + if (ret < 0) { + return ret; } if (!((bitmap_entry >> (extent_offset % 8)) & 1)) { - return -1; /* not allocated */ + return 0; /* not allocated */ } return bitmap_offset + (512 * (s->bitmap_blocks + extent_offset)); @@ -203,13 +222,16 @@ static int bochs_read(BlockDriverState *bs, int64_t sector_num, while (nb_sectors > 0) { int64_t block_offset = seek_to_sector(bs, sector_num); - if (block_offset >= 0) { + if (block_offset < 0) { + return block_offset; + } else if (block_offset > 0) { ret = bdrv_pread(bs->file, block_offset, buf, 512); - if (ret != 512) { - return -1; + if (ret < 0) { + return ret; } - } else + } else { memset(buf, 0, 512); + } nb_sectors--; sector_num++; buf += 512; diff --git a/block/cloop.c b/block/cloop.c index 6ea7cf4046..8457737922 100644 --- a/block/cloop.c +++ b/block/cloop.c @@ -26,6 +26,9 @@ #include "qemu/module.h" #include +/* Maximum compressed block size */ +#define MAX_BLOCK_SIZE (64 * 1024 * 1024) + typedef struct BDRVCloopState { CoMutex lock; uint32_t block_size; @@ -53,7 +56,8 @@ static int cloop_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int cloop_open(BlockDriverState *bs, QDict *options, int flags) +static int cloop_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVCloopState *s = bs->opaque; uint32_t offsets_size, max_compressed_block_size = 1, i; @@ -67,6 +71,26 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags) return ret; } s->block_size = be32_to_cpu(s->block_size); + if (s->block_size % 512) { + error_setg(errp, "block_size %" PRIu32 " must be a multiple of 512", + s->block_size); + return -EINVAL; + } + if (s->block_size == 0) { + error_setg(errp, "block_size cannot be zero"); + return -EINVAL; + } + + /* cloop's create_compressed_fs.c warns about block sizes beyond 256 KB but + * we can accept more. Prevent ridiculous values like 4 GB - 1 since we + * need a buffer this big. + */ + if (s->block_size > MAX_BLOCK_SIZE) { + error_setg(errp, "block_size %" PRIu32 " must be %u MB or less", + s->block_size, + MAX_BLOCK_SIZE / (1024 * 1024)); + return -EINVAL; + } ret = bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4); if (ret < 0) { @@ -75,7 +99,23 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags) s->n_blocks = be32_to_cpu(s->n_blocks); /* read offsets */ - offsets_size = s->n_blocks * sizeof(uint64_t); + if (s->n_blocks > (UINT32_MAX - 1) / sizeof(uint64_t)) { + /* Prevent integer overflow */ + error_setg(errp, "n_blocks %" PRIu32 " must be %zu or less", + s->n_blocks, + (UINT32_MAX - 1) / sizeof(uint64_t)); + return -EINVAL; + } + offsets_size = (s->n_blocks + 1) * sizeof(uint64_t); + if (offsets_size > 512 * 1024 * 1024) { + /* Prevent ridiculous offsets_size which causes memory allocation to + * fail or overflows bdrv_pread() size. In practice the 512 MB + * offsets[] limit supports 16 TB images at 256 KB block size. + */ + error_setg(errp, "image requires too many offsets, " + "try increasing block size"); + return -EINVAL; + } s->offsets = g_malloc(offsets_size); ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size); @@ -83,13 +123,37 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } - for(i=0;in_blocks;i++) { + for (i = 0; i < s->n_blocks + 1; i++) { + uint64_t size; + s->offsets[i] = be64_to_cpu(s->offsets[i]); - if (i > 0) { - uint32_t size = s->offsets[i] - s->offsets[i - 1]; - if (size > max_compressed_block_size) { - max_compressed_block_size = size; - } + if (i == 0) { + continue; + } + + if (s->offsets[i] < s->offsets[i - 1]) { + error_setg(errp, "offsets not monotonically increasing at " + "index %" PRIu32 ", image file is corrupt", i); + ret = -EINVAL; + goto fail; + } + + size = s->offsets[i] - s->offsets[i - 1]; + + /* Compressed blocks should be smaller than the uncompressed block size + * but maybe compression performed poorly so the compressed block is + * actually bigger. Clamp down on unrealistic values to prevent + * ridiculous s->compressed_block allocation. + */ + if (size > 2 * MAX_BLOCK_SIZE) { + error_setg(errp, "invalid compressed block size at index %" PRIu32 + ", image file is corrupt", i); + ret = -EINVAL; + goto fail; + } + + if (size > max_compressed_block_size) { + max_compressed_block_size = size; } } @@ -179,9 +243,7 @@ static coroutine_fn int cloop_co_read(BlockDriverState *bs, int64_t sector_num, static void cloop_close(BlockDriverState *bs) { BDRVCloopState *s = bs->opaque; - if (s->n_blocks > 0) { - g_free(s->offsets); - } + g_free(s->offsets); g_free(s->compressed_block); g_free(s->uncompressed_block); inflateEnd(&s->zstream); diff --git a/block/commit.c b/block/commit.c index 2227fc2e6c..5c09f4444e 100644 --- a/block/commit.c +++ b/block/commit.c @@ -103,14 +103,14 @@ static void coroutine_fn commit_run(void *opaque) /* Note that even when no rate limit is applied we need to yield * with no pending I/O here so that bdrv_drain_all() returns. */ - block_job_sleep_ns(&s->common, rt_clock, delay_ns); + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); if (block_job_is_cancelled(&s->common)) { break; } /* Copy if allocated above the base */ - ret = bdrv_co_is_allocated_above(top, base, sector_num, - COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE, - &n); + ret = bdrv_is_allocated_above(top, base, sector_num, + COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE, + &n); copy = (ret == 1); trace_commit_one_iteration(s, sector_num, n, ret); if (copy) { @@ -173,9 +173,9 @@ static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp) ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); } -static const BlockJobType commit_job_type = { +static const BlockJobDriver commit_job_driver = { .instance_size = sizeof(CommitBlockJob), - .job_type = "commit", + .job_type = BLOCK_JOB_TYPE_COMMIT, .set_speed = commit_set_speed, }; @@ -194,17 +194,11 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, if ((on_error == BLOCKDEV_ON_ERROR_STOP || on_error == BLOCKDEV_ON_ERROR_ENOSPC) && !bdrv_iostatus_is_enabled(bs)) { - error_set(errp, QERR_INVALID_PARAMETER_COMBINATION); - return; - } - - /* Once we support top == active layer, remove this check */ - if (top == bs) { - error_setg(errp, - "Top image as the active layer is currently unsupported"); + error_setg(errp, "Invalid parameter combination"); return; } + assert(top != bs); if (top == base) { error_setg(errp, "Invalid files for merge: top and base are the same"); return; @@ -238,7 +232,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, } - s = block_job_create(&commit_job_type, bs, speed, cb, opaque, errp); + s = block_job_create(&commit_job_driver, bs, speed, cb, opaque, errp); if (!s) { return; } diff --git a/block/cow.c b/block/cow.c index 1cc2e89c7c..164759f3a3 100644 --- a/block/cow.c +++ b/block/cow.c @@ -58,7 +58,8 @@ static int cow_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int cow_open(BlockDriverState *bs, QDict *options, int flags) +static int cow_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVCowState *s = bs->opaque; struct cow_header_v2 cow_header; @@ -73,15 +74,16 @@ static int cow_open(BlockDriverState *bs, QDict *options, int flags) } if (be32_to_cpu(cow_header.magic) != COW_MAGIC) { - ret = -EMEDIUMTYPE; + error_setg(errp, "Image not in COW format"); + ret = -EINVAL; goto fail; } if (be32_to_cpu(cow_header.version) != COW_VERSION) { char version[64]; snprintf(version, sizeof(version), - "COW version %d", cow_header.version); - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + "COW version %" PRIu32, cow_header.version); + error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, bs->device_name, "cow", version); ret = -ENOTSUP; goto fail; @@ -102,42 +104,45 @@ static int cow_open(BlockDriverState *bs, QDict *options, int flags) return ret; } -/* - * XXX(hch): right now these functions are extremely inefficient. - * We should just read the whole bitmap we'll need in one go instead. - */ -static inline int cow_set_bit(BlockDriverState *bs, int64_t bitnum) +static inline void cow_set_bits(uint8_t *bitmap, int start, int64_t nb_sectors) { - uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8; - uint8_t bitmap; - int ret; - - ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); - if (ret < 0) { - return ret; + int64_t bitnum = start, last = start + nb_sectors; + while (bitnum < last) { + if ((bitnum & 7) == 0 && bitnum + 8 <= last) { + bitmap[bitnum / 8] = 0xFF; + bitnum += 8; + continue; + } + bitmap[bitnum/8] |= (1 << (bitnum % 8)); + bitnum++; } +} - bitmap |= (1 << (bitnum % 8)); +#define BITS_PER_BITMAP_SECTOR (512 * 8) - ret = bdrv_pwrite_sync(bs->file, offset, &bitmap, sizeof(bitmap)); - if (ret < 0) { - return ret; - } - return 0; +/* Cannot use bitmap.c on big-endian machines. */ +static int cow_test_bit(int64_t bitnum, const uint8_t *bitmap) +{ + return (bitmap[bitnum / 8] & (1 << (bitnum & 7))) != 0; } -static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum) +static int cow_find_streak(const uint8_t *bitmap, int value, int start, int nb_sectors) { - uint64_t offset = sizeof(struct cow_header_v2) + bitnum / 8; - uint8_t bitmap; - int ret; - - ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); - if (ret < 0) { - return ret; + int streak_value = value ? 0xFF : 0; + int last = MIN(start + nb_sectors, BITS_PER_BITMAP_SECTOR); + int bitnum = start; + while (bitnum < last) { + if ((bitnum & 7) == 0 && bitmap[bitnum / 8] == streak_value) { + bitnum += 8; + continue; + } + if (cow_test_bit(bitnum, bitmap) == value) { + bitnum++; + continue; + } + break; } - - return !!(bitmap & (1 << (bitnum % 8))); + return MIN(bitnum, last) - start; } /* Return true if first block has been changed (ie. current version is @@ -146,40 +151,100 @@ static inline int is_bit_set(BlockDriverState *bs, int64_t bitnum) static int coroutine_fn cow_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *num_same) { - int changed; + int64_t bitnum = sector_num + sizeof(struct cow_header_v2) * 8; + uint64_t offset = (bitnum / 8) & -BDRV_SECTOR_SIZE; + bool first = true; + int changed = 0, same = 0; - if (nb_sectors == 0) { - *num_same = nb_sectors; - return 0; - } + do { + int ret; + uint8_t bitmap[BDRV_SECTOR_SIZE]; - changed = is_bit_set(bs, sector_num); - if (changed < 0) { - return 0; /* XXX: how to return I/O errors? */ - } + bitnum &= BITS_PER_BITMAP_SECTOR - 1; + int sector_bits = MIN(nb_sectors, BITS_PER_BITMAP_SECTOR - bitnum); - for (*num_same = 1; *num_same < nb_sectors; (*num_same)++) { - if (is_bit_set(bs, sector_num + *num_same) != changed) - break; - } + ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); + if (ret < 0) { + return ret; + } + + if (first) { + changed = cow_test_bit(bitnum, bitmap); + first = false; + } + + same += cow_find_streak(bitmap, changed, bitnum, nb_sectors); + + bitnum += sector_bits; + nb_sectors -= sector_bits; + offset += BDRV_SECTOR_SIZE; + } while (nb_sectors); + *num_same = same; return changed; } +static int64_t coroutine_fn cow_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *num_same) +{ + BDRVCowState *s = bs->opaque; + int ret = cow_co_is_allocated(bs, sector_num, nb_sectors, num_same); + int64_t offset = s->cow_sectors_offset + (sector_num << BDRV_SECTOR_BITS); + if (ret < 0) { + return ret; + } + return (ret ? BDRV_BLOCK_DATA : 0) | offset | BDRV_BLOCK_OFFSET_VALID; +} + static int cow_update_bitmap(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { - int error = 0; - int i; + int64_t bitnum = sector_num + sizeof(struct cow_header_v2) * 8; + uint64_t offset = (bitnum / 8) & -BDRV_SECTOR_SIZE; + bool first = true; + int sector_bits; + + for ( ; nb_sectors; + bitnum += sector_bits, + nb_sectors -= sector_bits, + offset += BDRV_SECTOR_SIZE) { + int ret, set; + uint8_t bitmap[BDRV_SECTOR_SIZE]; + + bitnum &= BITS_PER_BITMAP_SECTOR - 1; + sector_bits = MIN(nb_sectors, BITS_PER_BITMAP_SECTOR - bitnum); + + ret = bdrv_pread(bs->file, offset, &bitmap, sizeof(bitmap)); + if (ret < 0) { + return ret; + } + + /* Skip over any already set bits */ + set = cow_find_streak(bitmap, 1, bitnum, sector_bits); + bitnum += set; + sector_bits -= set; + nb_sectors -= set; + if (!sector_bits) { + continue; + } - for (i = 0; i < nb_sectors; i++) { - error = cow_set_bit(bs, sector_num + i); - if (error) { - break; + if (first) { + ret = bdrv_flush(bs->file); + if (ret < 0) { + return ret; + } + first = false; + } + + cow_set_bits(bitmap, bitnum, sector_bits); + + ret = bdrv_pwrite(bs->file, offset, &bitmap, sizeof(bitmap)); + if (ret < 0) { + return ret; } } - return error; + return 0; } static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num, @@ -189,7 +254,11 @@ static int coroutine_fn cow_read(BlockDriverState *bs, int64_t sector_num, int ret, n; while (nb_sectors > 0) { - if (bdrv_co_is_allocated(bs, sector_num, nb_sectors, &n)) { + ret = cow_co_is_allocated(bs, sector_num, nb_sectors, &n); + if (ret < 0) { + return ret; + } + if (ret) { ret = bdrv_pread(bs->file, s->cow_sectors_offset + sector_num * 512, buf, n * 512); @@ -255,12 +324,14 @@ static void cow_close(BlockDriverState *bs) { } -static int cow_create(const char *filename, QEMUOptionParameter *options) +static int cow_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { struct cow_header_v2 cow_header; struct stat st; int64_t image_sectors = 0; const char *image_filename = NULL; + Error *local_err = NULL; int ret; BlockDriverState *cow_bs; @@ -274,13 +345,17 @@ static int cow_create(const char *filename, QEMUOptionParameter *options) options++; } - ret = bdrv_create_file(filename, options); + ret = bdrv_create_file(filename, options, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } - ret = bdrv_file_open(&cow_bs, filename, NULL, BDRV_O_RDWR); + cow_bs = NULL; + ret = bdrv_open(&cow_bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } @@ -314,7 +389,7 @@ static int cow_create(const char *filename, QEMUOptionParameter *options) } exit: - bdrv_delete(cow_bs); + bdrv_unref(cow_bs); return ret; } @@ -344,7 +419,7 @@ static BlockDriver bdrv_cow = { .bdrv_read = cow_co_read, .bdrv_write = cow_co_write, - .bdrv_co_is_allocated = cow_co_is_allocated, + .bdrv_co_get_block_status = cow_co_get_block_status, .create_options = cow_create_options, }; diff --git a/block/curl.c b/block/curl.c index 82d39ff53f..d2f1084b91 100644 --- a/block/curl.c +++ b/block/curl.c @@ -34,6 +34,11 @@ #define DPRINTF(fmt, ...) do { } while (0) #endif +#if LIBCURL_VERSION_NUM >= 0x071000 +/* The multi interface timer callback was introduced in 7.16.0 */ +#define NEED_CURL_TIMER_CALLBACK +#endif + #define PROTOCOLS (CURLPROTO_HTTP | CURLPROTO_HTTPS | \ CURLPROTO_FTP | CURLPROTO_FTPS | \ CURLPROTO_TFTP) @@ -66,6 +71,7 @@ typedef struct CURLState struct BDRVCURLState *s; CURLAIOCB *acb[CURL_NUM_ACB]; CURL *curl; + curl_socket_t sock_fd; char *orig_buf; size_t buf_start; size_t buf_off; @@ -77,6 +83,7 @@ typedef struct CURLState typedef struct BDRVCURLState { CURLM *multi; + QEMUTimer timer; size_t len; CURLState states[CURL_NUM_STATES]; char *url; @@ -86,25 +93,45 @@ typedef struct BDRVCURLState { static void curl_clean_state(CURLState *s); static void curl_multi_do(void *arg); -static int curl_aio_flush(void *opaque); +static void curl_multi_read(void *arg); + +#ifdef NEED_CURL_TIMER_CALLBACK +static int curl_timer_cb(CURLM *multi, long timeout_ms, void *opaque) +{ + BDRVCURLState *s = opaque; + + DPRINTF("CURL: timer callback timeout_ms %ld\n", timeout_ms); + if (timeout_ms == -1) { + timer_del(&s->timer); + } else { + int64_t timeout_ns = (int64_t)timeout_ms * 1000 * 1000; + timer_mod(&s->timer, + qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ns); + } + return 0; +} +#endif static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, void *s, void *sp) { + CURLState *state = NULL; + curl_easy_getinfo(curl, CURLINFO_PRIVATE, (char **)&state); + state->sock_fd = fd; + DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, fd); switch (action) { case CURL_POLL_IN: - qemu_aio_set_fd_handler(fd, curl_multi_do, NULL, curl_aio_flush, s); + qemu_aio_set_fd_handler(fd, curl_multi_read, NULL, state); break; case CURL_POLL_OUT: - qemu_aio_set_fd_handler(fd, NULL, curl_multi_do, curl_aio_flush, s); + qemu_aio_set_fd_handler(fd, NULL, curl_multi_do, state); break; case CURL_POLL_INOUT: - qemu_aio_set_fd_handler(fd, curl_multi_do, curl_multi_do, - curl_aio_flush, s); + qemu_aio_set_fd_handler(fd, curl_multi_read, curl_multi_do, state); break; case CURL_POLL_REMOVE: - qemu_aio_set_fd_handler(fd, NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(fd, NULL, NULL, NULL); break; } @@ -134,8 +161,13 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque) DPRINTF("CURL: Just reading %zd bytes\n", realsize); if (!s || !s->orig_buf) - goto read_end; + return 0; + if (s->buf_off >= s->buf_len) { + /* buffer full, read nothing */ + return 0; + } + realsize = MIN(realsize, s->buf_len - s->buf_off); memcpy(s->orig_buf + s->buf_off, ptr, realsize); s->buf_off += realsize; @@ -154,7 +186,6 @@ static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque) } } -read_end: return realsize; } @@ -189,7 +220,8 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len, } // Wait for unfinished chunks - if ((start >= state->buf_start) && + if (state->in_use && + (start >= state->buf_start) && (start <= buf_fend) && (end >= state->buf_start) && (end <= buf_fend)) @@ -211,61 +243,87 @@ static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len, return FIND_RET_NONE; } -static void curl_multi_do(void *arg) +static void curl_multi_check_completion(BDRVCURLState *s) { - BDRVCURLState *s = (BDRVCURLState *)arg; - int running; - int r; int msgs_in_queue; - if (!s->multi) - return; - - do { - r = curl_multi_socket_all(s->multi, &running); - } while(r == CURLM_CALL_MULTI_PERFORM); - /* Try to find done transfers, so we can free the easy * handle again. */ - do { + for (;;) { CURLMsg *msg; msg = curl_multi_info_read(s->multi, &msgs_in_queue); + /* Quit when there are no more completions */ if (!msg) break; - if (msg->msg == CURLMSG_NONE) - break; - switch (msg->msg) { - case CURLMSG_DONE: - { - CURLState *state = NULL; - curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, (char**)&state); - - /* ACBs for successful messages get completed in curl_read_cb */ - if (msg->data.result != CURLE_OK) { - int i; - for (i = 0; i < CURL_NUM_ACB; i++) { - CURLAIOCB *acb = state->acb[i]; - - if (acb == NULL) { - continue; - } - - acb->common.cb(acb->common.opaque, -EIO); - qemu_aio_release(acb); - state->acb[i] = NULL; + if (msg->msg == CURLMSG_DONE) { + CURLState *state = NULL; + curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, + (char **)&state); + + /* ACBs for successful messages get completed in curl_read_cb */ + if (msg->data.result != CURLE_OK) { + int i; + for (i = 0; i < CURL_NUM_ACB; i++) { + CURLAIOCB *acb = state->acb[i]; + + if (acb == NULL) { + continue; } - } - curl_clean_state(state); - break; + acb->common.cb(acb->common.opaque, -EIO); + qemu_aio_release(acb); + state->acb[i] = NULL; + } } - default: - msgs_in_queue = 0; - break; + + curl_clean_state(state); + break; } - } while(msgs_in_queue); + } +} + +static void curl_multi_do(void *arg) +{ + CURLState *s = (CURLState *)arg; + int running; + int r; + + if (!s->s->multi) { + return; + } + + do { + r = curl_multi_socket_action(s->s->multi, s->sock_fd, 0, &running); + } while(r == CURLM_CALL_MULTI_PERFORM); + +} + +static void curl_multi_read(void *arg) +{ + CURLState *s = (CURLState *)arg; + + curl_multi_do(arg); + curl_multi_check_completion(s->s); +} + +static void curl_multi_timeout_do(void *arg) +{ +#ifdef NEED_CURL_TIMER_CALLBACK + BDRVCURLState *s = (BDRVCURLState *)arg; + int running; + + if (!s->multi) { + return; + } + + curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running); + + curl_multi_check_completion(s); +#else + abort(); +#endif } static CURLState *curl_init_state(BDRVCURLState *s) @@ -286,44 +344,42 @@ static CURLState *curl_init_state(BDRVCURLState *s) break; } if (!state) { - g_usleep(100); - curl_multi_do(s); + qemu_aio_wait(); } } while(!state); - if (state->curl) - goto has_curl; - - state->curl = curl_easy_init(); - if (!state->curl) - return NULL; - curl_easy_setopt(state->curl, CURLOPT_URL, s->url); - curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, 5); - curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, (void *)curl_read_cb); - curl_easy_setopt(state->curl, CURLOPT_WRITEDATA, (void *)state); - curl_easy_setopt(state->curl, CURLOPT_PRIVATE, (void *)state); - curl_easy_setopt(state->curl, CURLOPT_AUTOREFERER, 1); - curl_easy_setopt(state->curl, CURLOPT_FOLLOWLOCATION, 1); - curl_easy_setopt(state->curl, CURLOPT_NOSIGNAL, 1); - curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg); - curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1); - - /* Restrict supported protocols to avoid security issues in the more - * obscure protocols. For example, do not allow POP3/SMTP/IMAP see - * CVE-2013-0249. - * - * Restricting protocols is only supported from 7.19.4 upwards. - */ + if (!state->curl) { + state->curl = curl_easy_init(); + if (!state->curl) { + return NULL; + } + curl_easy_setopt(state->curl, CURLOPT_URL, s->url); + curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, 5); + curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, + (void *)curl_read_cb); + curl_easy_setopt(state->curl, CURLOPT_WRITEDATA, (void *)state); + curl_easy_setopt(state->curl, CURLOPT_PRIVATE, (void *)state); + curl_easy_setopt(state->curl, CURLOPT_AUTOREFERER, 1); + curl_easy_setopt(state->curl, CURLOPT_FOLLOWLOCATION, 1); + curl_easy_setopt(state->curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg); + curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1); + + /* Restrict supported protocols to avoid security issues in the more + * obscure protocols. For example, do not allow POP3/SMTP/IMAP see + * CVE-2013-0249. + * + * Restricting protocols is only supported from 7.19.4 upwards. + */ #if LIBCURL_VERSION_NUM >= 0x071304 - curl_easy_setopt(state->curl, CURLOPT_PROTOCOLS, PROTOCOLS); - curl_easy_setopt(state->curl, CURLOPT_REDIR_PROTOCOLS, PROTOCOLS); + curl_easy_setopt(state->curl, CURLOPT_PROTOCOLS, PROTOCOLS); + curl_easy_setopt(state->curl, CURLOPT_REDIR_PROTOCOLS, PROTOCOLS); #endif #ifdef DEBUG_VERBOSE - curl_easy_setopt(state->curl, CURLOPT_VERBOSE, 1); + curl_easy_setopt(state->curl, CURLOPT_VERBOSE, 1); #endif - -has_curl: + } state->s = s; @@ -397,7 +453,8 @@ static QemuOptsList runtime_opts = { }, }; -static int curl_open(BlockDriverState *bs, QDict *options, int flags) +static int curl_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVCURLState *s = bs->opaque; CURLState *state = NULL; @@ -409,30 +466,27 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags) static int inited = 0; if (flags & BDRV_O_RDWR) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, - "curl block device does not support writes"); + error_setg(errp, "curl block device does not support writes"); return -EROFS; } - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); goto out_noclean; } s->readahead_size = qemu_opt_get_size(opts, "readahead", READ_AHEAD_SIZE); if ((s->readahead_size & 0x1ff) != 0) { - fprintf(stderr, "HTTP_READAHEAD_SIZE %zd is not a multiple of 512\n", - s->readahead_size); + error_setg(errp, "HTTP_READAHEAD_SIZE %zd is not a multiple of 512", + s->readahead_size); goto out_noclean; } file = qemu_opt_get(opts, "url"); if (file == NULL) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, "curl block driver requires " - "an 'url' option"); + error_setg(errp, "curl block driver requires an 'url' option"); goto out_noclean; } @@ -474,19 +528,25 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags) curl_easy_cleanup(state->curl); state->curl = NULL; + aio_timer_init(bdrv_get_aio_context(bs), &s->timer, + QEMU_CLOCK_REALTIME, SCALE_NS, + curl_multi_timeout_do, s); + // Now we know the file exists and its size, so let's // initialize the multi interface! s->multi = curl_multi_init(); - curl_multi_setopt(s->multi, CURLMOPT_SOCKETDATA, s); curl_multi_setopt(s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb); - curl_multi_do(s); +#ifdef NEED_CURL_TIMER_CALLBACK + curl_multi_setopt(s->multi, CURLMOPT_TIMERDATA, s); + curl_multi_setopt(s->multi, CURLMOPT_TIMERFUNCTION, curl_timer_cb); +#endif qemu_opts_del(opts); return 0; out: - fprintf(stderr, "CURL: Error opening file: %s\n", state->errmsg); + error_setg(errp, "CURL: Error opening file: %s", state->errmsg); curl_easy_cleanup(state->curl); state->curl = NULL; out_noclean: @@ -495,21 +555,6 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags) return -EINVAL; } -static int curl_aio_flush(void *opaque) -{ - BDRVCURLState *s = opaque; - int i, j; - - for (i=0; i < CURL_NUM_STATES; i++) { - for(j=0; j < CURL_NUM_ACB; j++) { - if (s->states[i].acb[j]) { - return 1; - } - } - } - return 0; -} - static void curl_aio_cancel(BlockDriverAIOCB *blockacb) { // Do we have to implement canceling? Seems to work without... @@ -524,6 +569,7 @@ static const AIOCBInfo curl_aiocb_info = { static void curl_readv_bh_cb(void *p) { CURLState *state; + int running; CURLAIOCB *acb = p; BDRVCURLState *s = acb->common.bs->opaque; @@ -572,8 +618,9 @@ static void curl_readv_bh_cb(void *p) curl_easy_setopt(state->curl, CURLOPT_RANGE, state->range); curl_multi_add_handle(s->multi, state->curl); - curl_multi_do(s); + /* Tell curl it needs to kick things off */ + curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running); } static BlockDriverAIOCB *curl_aio_readv(BlockDriverState *bs, @@ -589,12 +636,6 @@ static BlockDriverAIOCB *curl_aio_readv(BlockDriverState *bs, acb->nb_sectors = nb_sectors; acb->bh = qemu_bh_new(curl_readv_bh_cb, acb); - - if (!acb->bh) { - DPRINTF("CURL: qemu_bh_new failed\n"); - return NULL; - } - qemu_bh_schedule(acb->bh); return &acb->common; } @@ -619,6 +660,9 @@ static void curl_close(BlockDriverState *bs) } if (s->multi) curl_multi_cleanup(s->multi); + + timer_del(&s->timer); + g_free(s->url); } diff --git a/block/dmg.c b/block/dmg.c index 3141cb5b88..1e153cd76d 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -27,6 +27,14 @@ #include "qemu/module.h" #include +enum { + /* Limit chunk sizes to prevent unreasonable amounts of memory being used + * or truncating when converting to 32-bit types + */ + DMG_LENGTHS_MAX = 64 * 1024 * 1024, /* 64 MB */ + DMG_SECTORCOUNTS_MAX = DMG_LENGTHS_MAX / 512, +}; + typedef struct BDRVDMGState { CoMutex lock; /* each chunk contains a certain number of sectors, @@ -92,12 +100,44 @@ static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result) return 0; } -static int dmg_open(BlockDriverState *bs, QDict *options, int flags) +/* Increase max chunk sizes, if necessary. This function is used to calculate + * the buffer sizes needed for compressed/uncompressed chunk I/O. + */ +static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, + uint32_t *max_compressed_size, + uint32_t *max_sectors_per_chunk) +{ + uint32_t compressed_size = 0; + uint32_t uncompressed_sectors = 0; + + switch (s->types[chunk]) { + case 0x80000005: /* zlib compressed */ + compressed_size = s->lengths[chunk]; + uncompressed_sectors = s->sectorcounts[chunk]; + break; + case 1: /* copy */ + uncompressed_sectors = (s->lengths[chunk] + 511) / 512; + break; + case 2: /* zero */ + uncompressed_sectors = s->sectorcounts[chunk]; + break; + } + + if (compressed_size > *max_compressed_size) { + *max_compressed_size = compressed_size; + } + if (uncompressed_sectors > *max_sectors_per_chunk) { + *max_sectors_per_chunk = uncompressed_sectors; + } +} + +static int dmg_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVDMGState *s = bs->opaque; - uint64_t info_begin,info_end,last_in_offset,last_out_offset; + uint64_t info_begin, info_end, last_in_offset, last_out_offset; uint32_t count, tmp; - uint32_t max_compressed_size=1,max_sectors_per_chunk=1,i; + uint32_t max_compressed_size = 1, max_sectors_per_chunk = 1, i; int64_t offset; int ret; @@ -159,37 +199,40 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } - if (type == 0x6d697368 && count >= 244) { - int new_size, chunk_count; + if (type == 0x6d697368 && count >= 244) { + size_t new_size; + uint32_t chunk_count; offset += 4; offset += 200; - chunk_count = (count-204)/40; - new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count); - s->types = g_realloc(s->types, new_size/2); - s->offsets = g_realloc(s->offsets, new_size); - s->lengths = g_realloc(s->lengths, new_size); - s->sectors = g_realloc(s->sectors, new_size); - s->sectorcounts = g_realloc(s->sectorcounts, new_size); + chunk_count = (count - 204) / 40; + new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count); + s->types = g_realloc(s->types, new_size / 2); + s->offsets = g_realloc(s->offsets, new_size); + s->lengths = g_realloc(s->lengths, new_size); + s->sectors = g_realloc(s->sectors, new_size); + s->sectorcounts = g_realloc(s->sectorcounts, new_size); for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) { ret = read_uint32(bs, offset, &s->types[i]); if (ret < 0) { goto fail; } - offset += 4; - if(s->types[i]!=0x80000005 && s->types[i]!=1 && s->types[i]!=2) { - if(s->types[i]==0xffffffff) { - last_in_offset = s->offsets[i-1]+s->lengths[i-1]; - last_out_offset = s->sectors[i-1]+s->sectorcounts[i-1]; - } - chunk_count--; - i--; - offset += 36; - continue; - } - offset += 4; + offset += 4; + if (s->types[i] != 0x80000005 && s->types[i] != 1 && + s->types[i] != 2) { + if (s->types[i] == 0xffffffff && i > 0) { + last_in_offset = s->offsets[i - 1] + s->lengths[i - 1]; + last_out_offset = s->sectors[i - 1] + + s->sectorcounts[i - 1]; + } + chunk_count--; + i--; + offset += 36; + continue; + } + offset += 4; ret = read_uint64(bs, offset, &s->sectors[i]); if (ret < 0) { @@ -204,6 +247,14 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags) } offset += 8; + if (s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { + error_report("sector count %" PRIu64 " for chunk %" PRIu32 + " is larger than max (%u)", + s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); + ret = -EINVAL; + goto fail; + } + ret = read_uint64(bs, offset, &s->offsets[i]); if (ret < 0) { goto fail; @@ -217,19 +268,25 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags) } offset += 8; - if(s->lengths[i]>max_compressed_size) - max_compressed_size = s->lengths[i]; - if(s->sectorcounts[i]>max_sectors_per_chunk) - max_sectors_per_chunk = s->sectorcounts[i]; - } - s->n_chunks+=chunk_count; - } + if (s->lengths[i] > DMG_LENGTHS_MAX) { + error_report("length %" PRIu64 " for chunk %" PRIu32 + " is larger than max (%u)", + s->lengths[i], i, DMG_LENGTHS_MAX); + ret = -EINVAL; + goto fail; + } + + update_max_chunk_size(s, i, &max_compressed_size, + &max_sectors_per_chunk); + } + s->n_chunks += chunk_count; + } } /* initialize zlib engine */ - s->compressed_chunk = g_malloc(max_compressed_size+1); - s->uncompressed_chunk = g_malloc(512*max_sectors_per_chunk); - if(inflateInit(&s->zstream) != Z_OK) { + s->compressed_chunk = g_malloc(max_compressed_size + 1); + s->uncompressed_chunk = g_malloc(512 * max_sectors_per_chunk); + if (inflateInit(&s->zstream) != Z_OK) { ret = -EINVAL; goto fail; } @@ -251,83 +308,82 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags) } static inline int is_sector_in_chunk(BDRVDMGState* s, - uint32_t chunk_num,int sector_num) + uint32_t chunk_num, uint64_t sector_num) { - if(chunk_num>=s->n_chunks || s->sectors[chunk_num]>sector_num || - s->sectors[chunk_num]+s->sectorcounts[chunk_num]<=sector_num) - return 0; - else - return -1; + if (chunk_num >= s->n_chunks || s->sectors[chunk_num] > sector_num || + s->sectors[chunk_num] + s->sectorcounts[chunk_num] <= sector_num) { + return 0; + } else { + return -1; + } } -static inline uint32_t search_chunk(BDRVDMGState* s,int sector_num) +static inline uint32_t search_chunk(BDRVDMGState *s, uint64_t sector_num) { /* binary search */ - uint32_t chunk1=0,chunk2=s->n_chunks,chunk3; - while(chunk1!=chunk2) { - chunk3 = (chunk1+chunk2)/2; - if(s->sectors[chunk3]>sector_num) - chunk2 = chunk3; - else if(s->sectors[chunk3]+s->sectorcounts[chunk3]>sector_num) - return chunk3; - else - chunk1 = chunk3; + uint32_t chunk1 = 0, chunk2 = s->n_chunks, chunk3; + while (chunk1 != chunk2) { + chunk3 = (chunk1 + chunk2) / 2; + if (s->sectors[chunk3] > sector_num) { + chunk2 = chunk3; + } else if (s->sectors[chunk3] + s->sectorcounts[chunk3] > sector_num) { + return chunk3; + } else { + chunk1 = chunk3; + } } return s->n_chunks; /* error */ } -static inline int dmg_read_chunk(BlockDriverState *bs, int sector_num) +static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) { BDRVDMGState *s = bs->opaque; - if(!is_sector_in_chunk(s,s->current_chunk,sector_num)) { - int ret; - uint32_t chunk = search_chunk(s,sector_num); - - if(chunk>=s->n_chunks) - return -1; - - s->current_chunk = s->n_chunks; - switch(s->types[chunk]) { - case 0x80000005: { /* zlib compressed */ - int i; - - /* we need to buffer, because only the chunk as whole can be - * inflated. */ - i=0; - do { - ret = bdrv_pread(bs->file, s->offsets[chunk] + i, - s->compressed_chunk+i, s->lengths[chunk]-i); - if(ret<0 && errno==EINTR) - ret=0; - i+=ret; - } while(ret>=0 && ret+ilengths[chunk]); - - if (ret != s->lengths[chunk]) - return -1; - - s->zstream.next_in = s->compressed_chunk; - s->zstream.avail_in = s->lengths[chunk]; - s->zstream.next_out = s->uncompressed_chunk; - s->zstream.avail_out = 512*s->sectorcounts[chunk]; - ret = inflateReset(&s->zstream); - if(ret != Z_OK) - return -1; - ret = inflate(&s->zstream, Z_FINISH); - if(ret != Z_STREAM_END || s->zstream.total_out != 512*s->sectorcounts[chunk]) - return -1; - break; } - case 1: /* copy */ - ret = bdrv_pread(bs->file, s->offsets[chunk], + if (!is_sector_in_chunk(s, s->current_chunk, sector_num)) { + int ret; + uint32_t chunk = search_chunk(s, sector_num); + + if (chunk >= s->n_chunks) { + return -1; + } + + s->current_chunk = s->n_chunks; + switch (s->types[chunk]) { + case 0x80000005: { /* zlib compressed */ + /* we need to buffer, because only the chunk as whole can be + * inflated. */ + ret = bdrv_pread(bs->file, s->offsets[chunk], + s->compressed_chunk, s->lengths[chunk]); + if (ret != s->lengths[chunk]) { + return -1; + } + + s->zstream.next_in = s->compressed_chunk; + s->zstream.avail_in = s->lengths[chunk]; + s->zstream.next_out = s->uncompressed_chunk; + s->zstream.avail_out = 512 * s->sectorcounts[chunk]; + ret = inflateReset(&s->zstream); + if (ret != Z_OK) { + return -1; + } + ret = inflate(&s->zstream, Z_FINISH); + if (ret != Z_STREAM_END || + s->zstream.total_out != 512 * s->sectorcounts[chunk]) { + return -1; + } + break; } + case 1: /* copy */ + ret = bdrv_pread(bs->file, s->offsets[chunk], s->uncompressed_chunk, s->lengths[chunk]); - if (ret != s->lengths[chunk]) - return -1; - break; - case 2: /* zero */ - memset(s->uncompressed_chunk, 0, 512*s->sectorcounts[chunk]); - break; - } - s->current_chunk = chunk; + if (ret != s->lengths[chunk]) { + return -1; + } + break; + case 2: /* zero */ + memset(s->uncompressed_chunk, 0, 512 * s->sectorcounts[chunk]); + break; + } + s->current_chunk = chunk; } return 0; } @@ -338,12 +394,14 @@ static int dmg_read(BlockDriverState *bs, int64_t sector_num, BDRVDMGState *s = bs->opaque; int i; - for(i=0;isectors[s->current_chunk]; - memcpy(buf+i*512,s->uncompressed_chunk+sector_offset_in_chunk*512,512); + for (i = 0; i < nb_sectors; i++) { + uint32_t sector_offset_in_chunk; + if (dmg_read_chunk(bs, sector_num + i) != 0) { + return -1; + } + sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk]; + memcpy(buf + i * 512, + s->uncompressed_chunk + sector_offset_in_chunk * 512, 512); } return 0; } @@ -375,12 +433,12 @@ static void dmg_close(BlockDriverState *bs) } static BlockDriver bdrv_dmg = { - .format_name = "dmg", - .instance_size = sizeof(BDRVDMGState), - .bdrv_probe = dmg_probe, - .bdrv_open = dmg_open, - .bdrv_read = dmg_co_read, - .bdrv_close = dmg_close, + .format_name = "dmg", + .instance_size = sizeof(BDRVDMGState), + .bdrv_probe = dmg_probe, + .bdrv_open = dmg_open, + .bdrv_read = dmg_co_read, + .bdrv_close = dmg_close, }; static void bdrv_dmg_init(void) diff --git a/block/gluster.c b/block/gluster.c index 645b7f12a5..d0726ec92c 100644 --- a/block/gluster.c +++ b/block/gluster.c @@ -3,43 +3,26 @@ * * Copyright (C) 2012 Bharata B Rao * - * Pipe handling mechanism in AIO implementation is derived from - * block/rbd.c. Hence, + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. * - * Copyright (C) 2010-2011 Christian Brunner , - * Josh Durgin - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. */ #include #include "block/block_int.h" -#include "qemu/sockets.h" #include "qemu/uri.h" typedef struct GlusterAIOCB { - BlockDriverAIOCB common; int64_t size; int ret; - bool *finished; QEMUBH *bh; + Coroutine *coroutine; } GlusterAIOCB; typedef struct BDRVGlusterState { struct glfs *glfs; - int fds[2]; struct glfs_fd *fd; - int qemu_aio_count; - int event_reader_pos; - GlusterAIOCB *event_acb; } BDRVGlusterState; -#define GLUSTER_FD_READ 0 -#define GLUSTER_FD_WRITE 1 - typedef struct GlusterConf { char *server; int port; @@ -50,11 +33,13 @@ typedef struct GlusterConf { static void qemu_gluster_gconf_free(GlusterConf *gconf) { - g_free(gconf->server); - g_free(gconf->volname); - g_free(gconf->image); - g_free(gconf->transport); - g_free(gconf); + if (gconf) { + g_free(gconf->server); + g_free(gconf->volname); + g_free(gconf->image); + g_free(gconf->transport); + g_free(gconf); + } } static int parse_volume_options(GlusterConf *gconf, char *path) @@ -95,7 +80,7 @@ static int parse_volume_options(GlusterConf *gconf, char *path) * 'server' specifies the server where the volume file specification for * the given volume resides. This can be either hostname, ipv4 address * or ipv6 address. ipv6 address needs to be within square brackets [ ]. - * If transport type is 'unix', then 'server' field should not be specifed. + * If transport type is 'unix', then 'server' field should not be specified. * The 'socket' field needs to be populated with the path to unix domain * socket. * @@ -132,7 +117,7 @@ static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename) } /* transport */ - if (!strcmp(uri->scheme, "gluster")) { + if (!uri->scheme || !strcmp(uri->scheme, "gluster")) { gconf->transport = g_strdup("tcp"); } else if (!strcmp(uri->scheme, "gluster+tcp")) { gconf->transport = g_strdup("tcp"); @@ -168,7 +153,7 @@ static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename) } gconf->server = g_strdup(qp->p[0].value); } else { - gconf->server = g_strdup(uri->server); + gconf->server = g_strdup(uri->server ? uri->server : "localhost"); gconf->port = uri->port; } @@ -180,7 +165,8 @@ static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename) return ret; } -static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename) +static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename, + Error **errp) { struct glfs *glfs = NULL; int ret; @@ -188,8 +174,8 @@ static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename) ret = qemu_gluster_parseuri(gconf, filename); if (ret < 0) { - error_report("Usage: file=gluster[+transport]://[server[:port]]/" - "volname/image[?socket=...]"); + error_setg(errp, "Usage: file=gluster[+transport]://[server[:port]]/" + "volname/image[?socket=...]"); errno = -ret; goto out; } @@ -216,9 +202,16 @@ static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename) ret = glfs_init(glfs); if (ret) { - error_report("Gluster connection failed for server=%s port=%d " - "volume=%s image=%s transport=%s", gconf->server, gconf->port, - gconf->volname, gconf->image, gconf->transport); + error_setg_errno(errp, errno, + "Gluster connection failed for server=%s port=%d " + "volume=%s image=%s transport=%s", gconf->server, + gconf->port, gconf->volname, gconf->image, + gconf->transport); + + /* glfs_init sometimes doesn't set errno although docs suggest that */ + if (errno == 0) + errno = EINVAL; + goto out; } return glfs; @@ -232,54 +225,32 @@ static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename) return NULL; } -static void qemu_gluster_complete_aio(GlusterAIOCB *acb, BDRVGlusterState *s) +static void qemu_gluster_complete_aio(void *opaque) { - int ret; - bool *finished = acb->finished; - BlockDriverCompletionFunc *cb = acb->common.cb; - void *opaque = acb->common.opaque; - - if (!acb->ret || acb->ret == acb->size) { - ret = 0; /* Success */ - } else if (acb->ret < 0) { - ret = acb->ret; /* Read/Write failed */ - } else { - ret = -EIO; /* Partial read/write - fail it */ - } + GlusterAIOCB *acb = (GlusterAIOCB *)opaque; - s->qemu_aio_count--; - qemu_aio_release(acb); - cb(opaque, ret); - if (finished) { - *finished = true; - } + qemu_bh_delete(acb->bh); + acb->bh = NULL; + qemu_coroutine_enter(acb->coroutine, NULL); } -static void qemu_gluster_aio_event_reader(void *opaque) +/* + * AIO callback routine called from GlusterFS thread. + */ +static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) { - BDRVGlusterState *s = opaque; - ssize_t ret; - - do { - char *p = (char *)&s->event_acb; - - ret = read(s->fds[GLUSTER_FD_READ], p + s->event_reader_pos, - sizeof(s->event_acb) - s->event_reader_pos); - if (ret > 0) { - s->event_reader_pos += ret; - if (s->event_reader_pos == sizeof(s->event_acb)) { - s->event_reader_pos = 0; - qemu_gluster_complete_aio(s->event_acb, s); - } - } - } while (ret < 0 && errno == EINTR); -} + GlusterAIOCB *acb = (GlusterAIOCB *)arg; -static int qemu_gluster_aio_flush_cb(void *opaque) -{ - BDRVGlusterState *s = opaque; + if (!ret || ret == acb->size) { + acb->ret = 0; /* Success */ + } else if (ret < 0) { + acb->ret = ret; /* Read/Write failed */ + } else { + acb->ret = -EIO; /* Partial read/write - fail it */ + } - return (s->qemu_aio_count > 0); + acb->bh = qemu_bh_new(qemu_gluster_complete_aio, acb); + qemu_bh_schedule(acb->bh); } /* TODO Convert to fine grained options */ @@ -296,60 +267,57 @@ static QemuOptsList runtime_opts = { }, }; +static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags) +{ + assert(open_flags != NULL); + + *open_flags |= O_BINARY; + + if (bdrv_flags & BDRV_O_RDWR) { + *open_flags |= O_RDWR; + } else { + *open_flags |= O_RDONLY; + } + + if ((bdrv_flags & BDRV_O_NOCACHE)) { + *open_flags |= O_DIRECT; + } +} + static int qemu_gluster_open(BlockDriverState *bs, QDict *options, - int bdrv_flags) + int bdrv_flags, Error **errp) { BDRVGlusterState *s = bs->opaque; - int open_flags = O_BINARY; + int open_flags = 0; int ret = 0; GlusterConf *gconf = g_malloc0(sizeof(GlusterConf)); QemuOpts *opts; Error *local_err = NULL; const char *filename; - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto out; } filename = qemu_opt_get(opts, "filename"); - - s->glfs = qemu_gluster_init(gconf, filename); + s->glfs = qemu_gluster_init(gconf, filename, errp); if (!s->glfs) { ret = -errno; goto out; } - if (bdrv_flags & BDRV_O_RDWR) { - open_flags |= O_RDWR; - } else { - open_flags |= O_RDONLY; - } - - if ((bdrv_flags & BDRV_O_NOCACHE)) { - open_flags |= O_DIRECT; - } + qemu_gluster_parse_flags(bdrv_flags, &open_flags); s->fd = glfs_open(s->glfs, gconf->image, open_flags); if (!s->fd) { ret = -errno; - goto out; } - ret = qemu_pipe(s->fds); - if (ret < 0) { - ret = -errno; - goto out; - } - fcntl(s->fds[GLUSTER_FD_READ], F_SETFL, O_NONBLOCK); - qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], - qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s); - out: qemu_opts_del(opts); qemu_gluster_gconf_free(gconf); @@ -365,16 +333,159 @@ static int qemu_gluster_open(BlockDriverState *bs, QDict *options, return ret; } +typedef struct BDRVGlusterReopenState { + struct glfs *glfs; + struct glfs_fd *fd; +} BDRVGlusterReopenState; + + +static int qemu_gluster_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + int ret = 0; + BDRVGlusterReopenState *reop_s; + GlusterConf *gconf = NULL; + int open_flags = 0; + + assert(state != NULL); + assert(state->bs != NULL); + + state->opaque = g_malloc0(sizeof(BDRVGlusterReopenState)); + reop_s = state->opaque; + + qemu_gluster_parse_flags(state->flags, &open_flags); + + gconf = g_malloc0(sizeof(GlusterConf)); + + reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, errp); + if (reop_s->glfs == NULL) { + ret = -errno; + goto exit; + } + + reop_s->fd = glfs_open(reop_s->glfs, gconf->image, open_flags); + if (reop_s->fd == NULL) { + /* reops->glfs will be cleaned up in _abort */ + ret = -errno; + goto exit; + } + +exit: + /* state->opaque will be freed in either the _abort or _commit */ + qemu_gluster_gconf_free(gconf); + return ret; +} + +static void qemu_gluster_reopen_commit(BDRVReopenState *state) +{ + BDRVGlusterReopenState *reop_s = state->opaque; + BDRVGlusterState *s = state->bs->opaque; + + + /* close the old */ + if (s->fd) { + glfs_close(s->fd); + } + if (s->glfs) { + glfs_fini(s->glfs); + } + + /* use the newly opened image / connection */ + s->fd = reop_s->fd; + s->glfs = reop_s->glfs; + + g_free(state->opaque); + state->opaque = NULL; + + return; +} + + +static void qemu_gluster_reopen_abort(BDRVReopenState *state) +{ + BDRVGlusterReopenState *reop_s = state->opaque; + + if (reop_s == NULL) { + return; + } + + if (reop_s->fd) { + glfs_close(reop_s->fd); + } + + if (reop_s->glfs) { + glfs_fini(reop_s->glfs); + } + + g_free(state->opaque); + state->opaque = NULL; + + return; +} + +#ifdef CONFIG_GLUSTERFS_ZEROFILL +static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + int ret; + GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); + BDRVGlusterState *s = bs->opaque; + off_t size = nb_sectors * BDRV_SECTOR_SIZE; + off_t offset = sector_num * BDRV_SECTOR_SIZE; + + acb->size = size; + acb->ret = 0; + acb->coroutine = qemu_coroutine_self(); + + ret = glfs_zerofill_async(s->fd, offset, size, &gluster_finish_aiocb, acb); + if (ret < 0) { + ret = -errno; + goto out; + } + + qemu_coroutine_yield(); + ret = acb->ret; + +out: + g_slice_free(GlusterAIOCB, acb); + return ret; +} + +static inline bool gluster_supports_zerofill(void) +{ + return 1; +} + +static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, + int64_t size) +{ + return glfs_zerofill(fd, offset, size); +} + +#else +static inline bool gluster_supports_zerofill(void) +{ + return 0; +} + +static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, + int64_t size) +{ + return 0; +} +#endif + static int qemu_gluster_create(const char *filename, - QEMUOptionParameter *options) + QEMUOptionParameter *options, Error **errp) { struct glfs *glfs; struct glfs_fd *fd; int ret = 0; + int prealloc = 0; int64_t total_size = 0; GlusterConf *gconf = g_malloc0(sizeof(GlusterConf)); - glfs = qemu_gluster_init(gconf, filename); + glfs = qemu_gluster_init(gconf, filename, errp); if (!glfs) { ret = -errno; goto out; @@ -383,6 +494,19 @@ static int qemu_gluster_create(const char *filename, while (options && options->name) { if (!strcmp(options->name, BLOCK_OPT_SIZE)) { total_size = options->value.n / BDRV_SECTOR_SIZE; + } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { + if (!options->value.s || !strcmp(options->value.s, "off")) { + prealloc = 0; + } else if (!strcmp(options->value.s, "full") && + gluster_supports_zerofill()) { + prealloc = 1; + } else { + error_setg(errp, "Invalid preallocation mode: '%s'" + " or GlusterFS doesn't support zerofill API", + options->value.s); + ret = -EINVAL; + goto out; + } } options++; } @@ -392,9 +516,15 @@ static int qemu_gluster_create(const char *filename, if (!fd) { ret = -errno; } else { - if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) { + if (!glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE)) { + if (prealloc && qemu_gluster_zerofill(fd, 0, + total_size * BDRV_SECTOR_SIZE)) { + ret = -errno; + } + } else { ret = -errno; } + if (glfs_close(fd) != 0) { ret = -errno; } @@ -407,72 +537,18 @@ static int qemu_gluster_create(const char *filename, return ret; } -static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb) -{ - GlusterAIOCB *acb = (GlusterAIOCB *)blockacb; - bool finished = false; - - acb->finished = &finished; - while (!finished) { - qemu_aio_wait(); - } -} - -static const AIOCBInfo gluster_aiocb_info = { - .aiocb_size = sizeof(GlusterAIOCB), - .cancel = qemu_gluster_aio_cancel, -}; - -static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) -{ - GlusterAIOCB *acb = (GlusterAIOCB *)arg; - BlockDriverState *bs = acb->common.bs; - BDRVGlusterState *s = bs->opaque; - int retval; - - acb->ret = ret; - retval = qemu_write_full(s->fds[GLUSTER_FD_WRITE], &acb, sizeof(acb)); - if (retval != sizeof(acb)) { - /* - * Gluster AIO callback thread failed to notify the waiting - * QEMU thread about IO completion. - * - * Complete this IO request and make the disk inaccessible for - * subsequent reads and writes. - */ - error_report("Gluster failed to notify QEMU about IO completion"); - - qemu_mutex_lock_iothread(); /* We are in gluster thread context */ - acb->common.cb(acb->common.opaque, -EIO); - qemu_aio_release(acb); - s->qemu_aio_count--; - close(s->fds[GLUSTER_FD_READ]); - close(s->fds[GLUSTER_FD_WRITE]); - qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL, - NULL); - bs->drv = NULL; /* Make the disk inaccessible */ - qemu_mutex_unlock_iothread(); - } -} - -static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque, int write) +static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write) { int ret; - GlusterAIOCB *acb; + GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); BDRVGlusterState *s = bs->opaque; - size_t size; - off_t offset; - - offset = sector_num * BDRV_SECTOR_SIZE; - size = nb_sectors * BDRV_SECTOR_SIZE; - s->qemu_aio_count++; + size_t size = nb_sectors * BDRV_SECTOR_SIZE; + off_t offset = sector_num * BDRV_SECTOR_SIZE; - acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque); acb->size = size; acb->ret = 0; - acb->finished = NULL; + acb->coroutine = qemu_coroutine_self(); if (write) { ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0, @@ -483,14 +559,16 @@ static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs, } if (ret < 0) { + ret = -errno; goto out; } - return &acb->common; + + qemu_coroutine_yield(); + ret = acb->ret; out: - s->qemu_aio_count--; - qemu_aio_release(acb); - return NULL; + g_slice_free(GlusterAIOCB, acb); + return ret; } static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset) @@ -506,75 +584,68 @@ static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset) return 0; } -static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) +static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); + return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0); } -static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) +static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); + return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1); } -static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs, - BlockDriverCompletionFunc *cb, void *opaque) +static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs) { int ret; - GlusterAIOCB *acb; + GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); BDRVGlusterState *s = bs->opaque; - acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque); acb->size = 0; acb->ret = 0; - acb->finished = NULL; - s->qemu_aio_count++; + acb->coroutine = qemu_coroutine_self(); ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb); if (ret < 0) { + ret = -errno; goto out; } - return &acb->common; + + qemu_coroutine_yield(); + ret = acb->ret; out: - s->qemu_aio_count--; - qemu_aio_release(acb); - return NULL; + g_slice_free(GlusterAIOCB, acb); + return ret; } #ifdef CONFIG_GLUSTERFS_DISCARD -static BlockDriverAIOCB *qemu_gluster_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BlockDriverCompletionFunc *cb, - void *opaque) +static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) { int ret; - GlusterAIOCB *acb; + GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); BDRVGlusterState *s = bs->opaque; - size_t size; - off_t offset; + size_t size = nb_sectors * BDRV_SECTOR_SIZE; + off_t offset = sector_num * BDRV_SECTOR_SIZE; - offset = sector_num * BDRV_SECTOR_SIZE; - size = nb_sectors * BDRV_SECTOR_SIZE; - - acb = qemu_aio_get(&gluster_aiocb_info, bs, cb, opaque); acb->size = 0; acb->ret = 0; - acb->finished = NULL; - s->qemu_aio_count++; + acb->coroutine = qemu_coroutine_self(); ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb); if (ret < 0) { + ret = -errno; goto out; } - return &acb->common; + + qemu_coroutine_yield(); + ret = acb->ret; out: - s->qemu_aio_count--; - qemu_aio_release(acb); - return NULL; + g_slice_free(GlusterAIOCB, acb); + return ret; } #endif @@ -609,10 +680,6 @@ static void qemu_gluster_close(BlockDriverState *bs) { BDRVGlusterState *s = bs->opaque; - close(s->fds[GLUSTER_FD_READ]); - close(s->fds[GLUSTER_FD_WRITE]); - qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], NULL, NULL, NULL, NULL); - if (s->fd) { glfs_close(s->fd); s->fd = NULL; @@ -632,6 +699,11 @@ static QEMUOptionParameter qemu_gluster_create_options[] = { .type = OPT_SIZE, .help = "Virtual disk size" }, + { + .name = BLOCK_OPT_PREALLOC, + .type = OPT_STRING, + .help = "Preallocation mode (allowed values: off, full)" + }, { NULL } }; @@ -639,18 +711,25 @@ static BlockDriver bdrv_gluster = { .format_name = "gluster", .protocol_name = "gluster", .instance_size = sizeof(BDRVGlusterState), + .bdrv_needs_filename = true, .bdrv_file_open = qemu_gluster_open, + .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, + .bdrv_reopen_commit = qemu_gluster_reopen_commit, + .bdrv_reopen_abort = qemu_gluster_reopen_abort, .bdrv_close = qemu_gluster_close, .bdrv_create = qemu_gluster_create, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, - .bdrv_aio_readv = qemu_gluster_aio_readv, - .bdrv_aio_writev = qemu_gluster_aio_writev, - .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_aio_discard = qemu_gluster_aio_discard, + .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif .create_options = qemu_gluster_create_options, }; @@ -659,18 +738,25 @@ static BlockDriver bdrv_gluster_tcp = { .format_name = "gluster", .protocol_name = "gluster+tcp", .instance_size = sizeof(BDRVGlusterState), + .bdrv_needs_filename = true, .bdrv_file_open = qemu_gluster_open, + .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, + .bdrv_reopen_commit = qemu_gluster_reopen_commit, + .bdrv_reopen_abort = qemu_gluster_reopen_abort, .bdrv_close = qemu_gluster_close, .bdrv_create = qemu_gluster_create, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, - .bdrv_aio_readv = qemu_gluster_aio_readv, - .bdrv_aio_writev = qemu_gluster_aio_writev, - .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_aio_discard = qemu_gluster_aio_discard, + .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif .create_options = qemu_gluster_create_options, }; @@ -679,18 +765,25 @@ static BlockDriver bdrv_gluster_unix = { .format_name = "gluster", .protocol_name = "gluster+unix", .instance_size = sizeof(BDRVGlusterState), + .bdrv_needs_filename = true, .bdrv_file_open = qemu_gluster_open, + .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, + .bdrv_reopen_commit = qemu_gluster_reopen_commit, + .bdrv_reopen_abort = qemu_gluster_reopen_abort, .bdrv_close = qemu_gluster_close, .bdrv_create = qemu_gluster_create, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, - .bdrv_aio_readv = qemu_gluster_aio_readv, - .bdrv_aio_writev = qemu_gluster_aio_writev, - .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_aio_discard = qemu_gluster_aio_discard, + .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif .create_options = qemu_gluster_create_options, }; @@ -699,18 +792,25 @@ static BlockDriver bdrv_gluster_rdma = { .format_name = "gluster", .protocol_name = "gluster+rdma", .instance_size = sizeof(BDRVGlusterState), + .bdrv_needs_filename = true, .bdrv_file_open = qemu_gluster_open, + .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, + .bdrv_reopen_commit = qemu_gluster_reopen_commit, + .bdrv_reopen_abort = qemu_gluster_reopen_abort, .bdrv_close = qemu_gluster_close, .bdrv_create = qemu_gluster_create, .bdrv_getlength = qemu_gluster_getlength, .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, .bdrv_truncate = qemu_gluster_truncate, - .bdrv_aio_readv = qemu_gluster_aio_readv, - .bdrv_aio_writev = qemu_gluster_aio_writev, - .bdrv_aio_flush = qemu_gluster_aio_flush, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, .bdrv_has_zero_init = qemu_gluster_has_zero_init, #ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_aio_discard = qemu_gluster_aio_discard, + .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, #endif .create_options = qemu_gluster_create_options, }; diff --git a/block/iscsi.c b/block/iscsi.c index e7c1c2b538..52355b8bce 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -2,6 +2,7 @@ * QEMU Block driver for iSCSI images * * Copyright (c) 2010-2011 Ronnie Sahlberg + * Copyright (c) 2012-2014 Peter Lieven * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,6 +34,8 @@ #include "trace.h" #include "block/scsi.h" #include "qemu/iov.h" +#include "sysemu/sysemu.h" +#include "qmp-commands.h" #include #include @@ -50,8 +53,24 @@ typedef struct IscsiLun { uint64_t num_blocks; int events; QEMUTimer *nop_timer; + uint8_t lbpme; + uint8_t lbprz; + uint8_t has_write_same; + struct scsi_inquiry_logical_block_provisioning lbp; + struct scsi_inquiry_block_limits bl; + unsigned char *zeroblock; } IscsiLun; +typedef struct IscsiTask { + int status; + int complete; + int retries; + int do_retry; + struct scsi_task *task; + Coroutine *co; + QEMUBH *bh; +} IscsiTask; + typedef struct IscsiAIOCB { BlockDriverAIOCB common; QEMUIOVector *qiov; @@ -105,6 +124,50 @@ iscsi_schedule_bh(IscsiAIOCB *acb) qemu_bh_schedule(acb->bh); } +static void iscsi_co_generic_bh_cb(void *opaque) +{ + struct IscsiTask *iTask = opaque; + qemu_bh_delete(iTask->bh); + qemu_coroutine_enter(iTask->co, NULL); +} + +static void +iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *opaque) +{ + struct IscsiTask *iTask = opaque; + struct scsi_task *task = command_data; + + iTask->complete = 1; + iTask->status = status; + iTask->do_retry = 0; + iTask->task = task; + + if (iTask->retries-- > 0 && status == SCSI_STATUS_CHECK_CONDITION + && task->sense.key == SCSI_SENSE_UNIT_ATTENTION) { + error_report("iSCSI CheckCondition: %s", iscsi_get_error(iscsi)); + iTask->do_retry = 1; + goto out; + } + + if (status != SCSI_STATUS_GOOD) { + error_report("iSCSI Failure: %s", iscsi_get_error(iscsi)); + } + +out: + if (iTask->co) { + iTask->bh = qemu_bh_new(iscsi_co_generic_bh_cb, iTask); + qemu_bh_schedule(iTask->bh); + } +} + +static void iscsi_co_init_iscsitask(IscsiLun *iscsilun, struct IscsiTask *iTask) +{ + *iTask = (struct IscsiTask) { + .co = qemu_coroutine_self(), + .retries = ISCSI_CMD_RETRIES, + }; +} static void iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data, @@ -146,13 +209,6 @@ static const AIOCBInfo iscsi_aiocb_info = { static void iscsi_process_read(void *arg); static void iscsi_process_write(void *arg); -static int iscsi_process_flush(void *arg) -{ - IscsiLun *iscsilun = arg; - - return iscsi_queue_length(iscsilun->iscsi) > 0; -} - static void iscsi_set_events(IscsiLun *iscsilun) { @@ -166,7 +222,6 @@ iscsi_set_events(IscsiLun *iscsilun) qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), iscsi_process_read, (ev & POLLOUT) ? iscsi_process_write : NULL, - iscsi_process_flush, iscsilun); } @@ -194,44 +249,6 @@ iscsi_process_write(void *arg) iscsi_set_events(iscsilun); } -static int -iscsi_aio_writev_acb(IscsiAIOCB *acb); - -static void -iscsi_aio_write16_cb(struct iscsi_context *iscsi, int status, - void *command_data, void *opaque) -{ - IscsiAIOCB *acb = opaque; - - trace_iscsi_aio_write16_cb(iscsi, status, acb, acb->canceled); - - g_free(acb->buf); - acb->buf = NULL; - - if (acb->canceled != 0) { - return; - } - - acb->status = 0; - if (status != 0) { - if (status == SCSI_STATUS_CHECK_CONDITION - && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION - && acb->retries-- > 0) { - scsi_free_scsi_task(acb->task); - acb->task = NULL; - if (iscsi_aio_writev_acb(acb) == 0) { - iscsi_set_events(acb->iscsilun); - return; - } - } - error_report("Failed to write16 data to iSCSI lun. %s", - iscsi_get_error(iscsi)); - acb->status = -EIO; - } - - iscsi_schedule_bh(acb); -} - static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun) { return sector * iscsilun->block_size / BDRV_SECTOR_SIZE; @@ -256,406 +273,182 @@ static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors, return 1; } -static int -iscsi_aio_writev_acb(IscsiAIOCB *acb) +static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) { - struct iscsi_context *iscsi = acb->iscsilun->iscsi; - size_t size; - uint32_t num_sectors; + IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; uint64_t lba; -#if !defined(LIBISCSI_FEATURE_IOVECTOR) - struct iscsi_data data; -#endif - int ret; - - acb->canceled = 0; - acb->bh = NULL; - acb->status = -EINPROGRESS; - acb->buf = NULL; + uint32_t num_sectors; + uint8_t *data = NULL; + uint8_t *buf = NULL; - /* this will allow us to get rid of 'buf' completely */ - size = acb->nb_sectors * BDRV_SECTOR_SIZE; + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return -EINVAL; + } + lba = sector_qemu2lun(sector_num, iscsilun); + num_sectors = sector_qemu2lun(nb_sectors, iscsilun); #if !defined(LIBISCSI_FEATURE_IOVECTOR) - data.size = MIN(size, acb->qiov->size); - /* if the iovec only contains one buffer we can pass it directly */ - if (acb->qiov->niov == 1) { - data.data = acb->qiov->iov[0].iov_base; + if (iov->niov == 1) { + data = iov->iov[0].iov_base; } else { - acb->buf = g_malloc(data.size); - qemu_iovec_to_buf(acb->qiov, 0, acb->buf, data.size); - data.data = acb->buf; + size_t size = MIN(nb_sectors * BDRV_SECTOR_SIZE, iov->size); + buf = g_malloc(size); + qemu_iovec_to_buf(iov, 0, buf, size); + data = buf; } #endif - - acb->task = malloc(sizeof(struct scsi_task)); - if (acb->task == NULL) { - error_report("iSCSI: Failed to allocate task for scsi WRITE16 " - "command. %s", iscsi_get_error(iscsi)); - return -1; + iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: + iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba, + data, num_sectors * iscsilun->block_size, + iscsilun->block_size, 0, 0, 0, 0, 0, + iscsi_co_generic_cb, &iTask); + if (iTask.task == NULL) { + g_free(buf); + return -ENOMEM; } - memset(acb->task, 0, sizeof(struct scsi_task)); - - acb->task->xfer_dir = SCSI_XFER_WRITE; - acb->task->cdb_size = 16; - acb->task->cdb[0] = 0x8a; - lba = sector_qemu2lun(acb->sector_num, acb->iscsilun); - *(uint32_t *)&acb->task->cdb[2] = htonl(lba >> 32); - *(uint32_t *)&acb->task->cdb[6] = htonl(lba & 0xffffffff); - num_sectors = sector_qemu2lun(acb->nb_sectors, acb->iscsilun); - *(uint32_t *)&acb->task->cdb[10] = htonl(num_sectors); - acb->task->expxferlen = size; - #if defined(LIBISCSI_FEATURE_IOVECTOR) - ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task, - iscsi_aio_write16_cb, - NULL, - acb); -#else - ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task, - iscsi_aio_write16_cb, - &data, - acb); + scsi_task_set_iov_out(iTask.task, (struct scsi_iovec *) iov->iov, + iov->niov); #endif - if (ret != 0) { - scsi_free_scsi_task(acb->task); - g_free(acb->buf); - return -1; + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); } -#if defined(LIBISCSI_FEATURE_IOVECTOR) - scsi_task_set_iov_out(acb->task, (struct scsi_iovec*) acb->qiov->iov, acb->qiov->niov); -#endif - - return 0; -} - -static BlockDriverAIOCB * -iscsi_aio_writev(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque) -{ - IscsiLun *iscsilun = bs->opaque; - IscsiAIOCB *acb; - - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - return NULL; + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; } - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); - trace_iscsi_aio_writev(iscsilun->iscsi, sector_num, nb_sectors, opaque, acb); - - acb->iscsilun = iscsilun; - acb->qiov = qiov; - acb->nb_sectors = nb_sectors; - acb->sector_num = sector_num; - acb->retries = ISCSI_CMD_RETRIES; - - if (iscsi_aio_writev_acb(acb) != 0) { - qemu_aio_release(acb); - return NULL; + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; } - iscsi_set_events(iscsilun); - return &acb->common; -} - -static int -iscsi_aio_readv_acb(IscsiAIOCB *acb); - -static void -iscsi_aio_read16_cb(struct iscsi_context *iscsi, int status, - void *command_data, void *opaque) -{ - IscsiAIOCB *acb = opaque; - - trace_iscsi_aio_read16_cb(iscsi, status, acb, acb->canceled); + g_free(buf); - if (acb->canceled != 0) { - return; + if (iTask.status != SCSI_STATUS_GOOD) { + return -EIO; } - acb->status = 0; - if (status != 0) { - if (status == SCSI_STATUS_CHECK_CONDITION - && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION - && acb->retries-- > 0) { - scsi_free_scsi_task(acb->task); - acb->task = NULL; - if (iscsi_aio_readv_acb(acb) == 0) { - iscsi_set_events(acb->iscsilun); - return; - } - } - error_report("Failed to read16 data from iSCSI lun. %s", - iscsi_get_error(iscsi)); - acb->status = -EIO; - } - - iscsi_schedule_bh(acb); + return 0; } -static int -iscsi_aio_readv_acb(IscsiAIOCB *acb) +static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) { - struct iscsi_context *iscsi = acb->iscsilun->iscsi; - size_t size; + IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; uint64_t lba; uint32_t num_sectors; - int ret; #if !defined(LIBISCSI_FEATURE_IOVECTOR) int i; #endif - acb->canceled = 0; - acb->bh = NULL; - acb->status = -EINPROGRESS; - acb->buf = NULL; - - size = acb->nb_sectors * BDRV_SECTOR_SIZE; - - acb->task = malloc(sizeof(struct scsi_task)); - if (acb->task == NULL) { - error_report("iSCSI: Failed to allocate task for scsi READ16 " - "command. %s", iscsi_get_error(iscsi)); - return -1; + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return -EINVAL; } - memset(acb->task, 0, sizeof(struct scsi_task)); - acb->task->xfer_dir = SCSI_XFER_READ; - acb->task->expxferlen = size; - lba = sector_qemu2lun(acb->sector_num, acb->iscsilun); - num_sectors = sector_qemu2lun(acb->nb_sectors, acb->iscsilun); + lba = sector_qemu2lun(sector_num, iscsilun); + num_sectors = sector_qemu2lun(nb_sectors, iscsilun); - switch (acb->iscsilun->type) { + iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: + switch (iscsilun->type) { case TYPE_DISK: - acb->task->cdb_size = 16; - acb->task->cdb[0] = 0x88; - *(uint32_t *)&acb->task->cdb[2] = htonl(lba >> 32); - *(uint32_t *)&acb->task->cdb[6] = htonl(lba & 0xffffffff); - *(uint32_t *)&acb->task->cdb[10] = htonl(num_sectors); + iTask.task = iscsi_read16_task(iscsilun->iscsi, iscsilun->lun, lba, + num_sectors * iscsilun->block_size, + iscsilun->block_size, 0, 0, 0, 0, 0, + iscsi_co_generic_cb, &iTask); break; default: - acb->task->cdb_size = 10; - acb->task->cdb[0] = 0x28; - *(uint32_t *)&acb->task->cdb[2] = htonl(lba); - *(uint16_t *)&acb->task->cdb[7] = htons(num_sectors); + iTask.task = iscsi_read10_task(iscsilun->iscsi, iscsilun->lun, lba, + num_sectors * iscsilun->block_size, + iscsilun->block_size, +#if !defined(CONFIG_LIBISCSI_1_4) /* API change from 1.4.0 to 1.5.0 */ + 0, 0, 0, 0, 0, +#endif + iscsi_co_generic_cb, &iTask); break; } - - ret = iscsi_scsi_command_async(iscsi, acb->iscsilun->lun, acb->task, - iscsi_aio_read16_cb, - NULL, - acb); - if (ret != 0) { - scsi_free_scsi_task(acb->task); - return -1; + if (iTask.task == NULL) { + return -ENOMEM; } - #if defined(LIBISCSI_FEATURE_IOVECTOR) - scsi_task_set_iov_in(acb->task, (struct scsi_iovec*) acb->qiov->iov, acb->qiov->niov); + scsi_task_set_iov_in(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov); #else - for (i = 0; i < acb->qiov->niov; i++) { - scsi_task_add_data_in_buffer(acb->task, - acb->qiov->iov[i].iov_len, - acb->qiov->iov[i].iov_base); + for (i = 0; i < iov->niov; i++) { + scsi_task_add_data_in_buffer(iTask.task, + iov->iov[i].iov_len, + iov->iov[i].iov_base); } #endif - return 0; -} -static BlockDriverAIOCB * -iscsi_aio_readv(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockDriverCompletionFunc *cb, - void *opaque) -{ - IscsiLun *iscsilun = bs->opaque; - IscsiAIOCB *acb; - - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - return NULL; - } - - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); - trace_iscsi_aio_readv(iscsilun->iscsi, sector_num, nb_sectors, opaque, acb); - - acb->nb_sectors = nb_sectors; - acb->sector_num = sector_num; - acb->iscsilun = iscsilun; - acb->qiov = qiov; - acb->retries = ISCSI_CMD_RETRIES; - - if (iscsi_aio_readv_acb(acb) != 0) { - qemu_aio_release(acb); - return NULL; + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); } - iscsi_set_events(iscsilun); - return &acb->common; -} - -static int -iscsi_aio_flush_acb(IscsiAIOCB *acb); - -static void -iscsi_synccache10_cb(struct iscsi_context *iscsi, int status, - void *command_data, void *opaque) -{ - IscsiAIOCB *acb = opaque; - - if (acb->canceled != 0) { - return; + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; } - acb->status = 0; - if (status != 0) { - if (status == SCSI_STATUS_CHECK_CONDITION - && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION - && acb->retries-- > 0) { - scsi_free_scsi_task(acb->task); - acb->task = NULL; - if (iscsi_aio_flush_acb(acb) == 0) { - iscsi_set_events(acb->iscsilun); - return; - } - } - error_report("Failed to sync10 data on iSCSI lun. %s", - iscsi_get_error(iscsi)); - acb->status = -EIO; + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; } - iscsi_schedule_bh(acb); -} - -static int -iscsi_aio_flush_acb(IscsiAIOCB *acb) -{ - struct iscsi_context *iscsi = acb->iscsilun->iscsi; - - acb->canceled = 0; - acb->bh = NULL; - acb->status = -EINPROGRESS; - acb->buf = NULL; - - acb->task = iscsi_synchronizecache10_task(iscsi, acb->iscsilun->lun, - 0, 0, 0, 0, - iscsi_synccache10_cb, - acb); - if (acb->task == NULL) { - error_report("iSCSI: Failed to send synchronizecache10 command. %s", - iscsi_get_error(iscsi)); - return -1; + if (iTask.status != SCSI_STATUS_GOOD) { + return -EIO; } return 0; } -static BlockDriverAIOCB * -iscsi_aio_flush(BlockDriverState *bs, - BlockDriverCompletionFunc *cb, void *opaque) +static int coroutine_fn iscsi_co_flush(BlockDriverState *bs) { IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; - IscsiAIOCB *acb; - - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); - - acb->iscsilun = iscsilun; - acb->retries = ISCSI_CMD_RETRIES; - - if (iscsi_aio_flush_acb(acb) != 0) { - qemu_aio_release(acb); - return NULL; + if (bs->sg) { + return 0; } - iscsi_set_events(iscsilun); - - return &acb->common; -} + iscsi_co_init_iscsitask(iscsilun, &iTask); -static int iscsi_aio_discard_acb(IscsiAIOCB *acb); - -static void -iscsi_unmap_cb(struct iscsi_context *iscsi, int status, - void *command_data, void *opaque) -{ - IscsiAIOCB *acb = opaque; - - if (acb->canceled != 0) { - return; +retry: + if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0, + 0, iscsi_co_generic_cb, &iTask) == NULL) { + return -ENOMEM; } - acb->status = 0; - if (status != 0) { - if (status == SCSI_STATUS_CHECK_CONDITION - && acb->task->sense.key == SCSI_SENSE_UNIT_ATTENTION - && acb->retries-- > 0) { - scsi_free_scsi_task(acb->task); - acb->task = NULL; - if (iscsi_aio_discard_acb(acb) == 0) { - iscsi_set_events(acb->iscsilun); - return; - } - } - error_report("Failed to unmap data on iSCSI lun. %s", - iscsi_get_error(iscsi)); - acb->status = -EIO; + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); } - iscsi_schedule_bh(acb); -} - -static int iscsi_aio_discard_acb(IscsiAIOCB *acb) { - struct iscsi_context *iscsi = acb->iscsilun->iscsi; - struct unmap_list list[1]; - - acb->canceled = 0; - acb->bh = NULL; - acb->status = -EINPROGRESS; - acb->buf = NULL; - - list[0].lba = sector_qemu2lun(acb->sector_num, acb->iscsilun); - list[0].num = acb->nb_sectors * BDRV_SECTOR_SIZE / acb->iscsilun->block_size; - - acb->task = iscsi_unmap_task(iscsi, acb->iscsilun->lun, - 0, 0, &list[0], 1, - iscsi_unmap_cb, - acb); - if (acb->task == NULL) { - error_report("iSCSI: Failed to send unmap command. %s", - iscsi_get_error(iscsi)); - return -1; + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; } - return 0; -} - -static BlockDriverAIOCB * -iscsi_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BlockDriverCompletionFunc *cb, void *opaque) -{ - IscsiLun *iscsilun = bs->opaque; - IscsiAIOCB *acb; - - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); - - acb->iscsilun = iscsilun; - acb->nb_sectors = nb_sectors; - acb->sector_num = sector_num; - acb->retries = ISCSI_CMD_RETRIES; - - if (iscsi_aio_discard_acb(acb) != 0) { - qemu_aio_release(acb); - return NULL; + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; } - iscsi_set_events(iscsilun); + if (iTask.status != SCSI_STATUS_GOOD) { + return -EIO; + } - return &acb->common; + return 0; } #ifdef __linux__ @@ -850,7 +643,234 @@ iscsi_getlength(BlockDriverState *bs) return len; } -static int parse_chap(struct iscsi_context *iscsi, const char *target) +#if defined(LIBISCSI_FEATURE_IOVECTOR) + +static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + IscsiLun *iscsilun = bs->opaque; + struct scsi_get_lba_status *lbas = NULL; + struct scsi_lba_status_descriptor *lbasd = NULL; + struct IscsiTask iTask; + int64_t ret; + + iscsi_co_init_iscsitask(iscsilun, &iTask); + + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + ret = -EINVAL; + goto out; + } + + /* default to all sectors allocated */ + ret = BDRV_BLOCK_DATA; + ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID; + *pnum = nb_sectors; + + /* LUN does not support logical block provisioning */ + if (iscsilun->lbpme == 0) { + goto out; + } + +retry: + if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun, + sector_qemu2lun(sector_num, iscsilun), + 8 + 16, iscsi_co_generic_cb, + &iTask) == NULL) { + ret = -ENOMEM; + goto out; + } + + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); + } + + if (iTask.do_retry) { + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; + } + iTask.complete = 0; + goto retry; + } + + if (iTask.status != SCSI_STATUS_GOOD) { + /* in case the get_lba_status_callout fails (i.e. + * because the device is busy or the cmd is not + * supported) we pretend all blocks are allocated + * for backwards compatibility */ + goto out; + } + + lbas = scsi_datain_unmarshall(iTask.task); + if (lbas == NULL) { + ret = -EIO; + goto out; + } + + lbasd = &lbas->descriptors[0]; + + if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) { + ret = -EIO; + goto out; + } + + *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun); + if (*pnum > nb_sectors) { + *pnum = nb_sectors; + } + + if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED || + lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) { + ret &= ~BDRV_BLOCK_DATA; + if (iscsilun->lbprz) { + ret |= BDRV_BLOCK_ZERO; + } + } + +out: + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + } + return ret; +} + +#endif /* LIBISCSI_FEATURE_IOVECTOR */ + +static int +coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; + struct unmap_list list; + + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return -EINVAL; + } + + if (!iscsilun->lbp.lbpu) { + /* UNMAP is not supported by the target */ + return 0; + } + + list.lba = sector_qemu2lun(sector_num, iscsilun); + list.num = sector_qemu2lun(nb_sectors, iscsilun); + + iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: + if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1, + iscsi_co_generic_cb, &iTask) == NULL) { + return -ENOMEM; + } + + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); + } + + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; + } + + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; + } + + if (iTask.status == SCSI_STATUS_CHECK_CONDITION) { + /* the target might fail with a check condition if it + is not happy with the alignment of the UNMAP request + we silently fail in this case */ + return 0; + } + + if (iTask.status != SCSI_STATUS_GOOD) { + return -EIO; + } + + return 0; +} + +#if defined(SCSI_SENSE_ASCQ_CAPACITY_DATA_HAS_CHANGED) + +static int +coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; + uint64_t lba; + uint32_t nb_blocks; + + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return -EINVAL; + } + + if (!(flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->has_write_same) { + /* WRITE SAME without UNMAP is not supported by the target */ + return -ENOTSUP; + } + + if ((flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->lbp.lbpws) { + /* WRITE SAME with UNMAP is not supported by the target */ + return -ENOTSUP; + } + + lba = sector_qemu2lun(sector_num, iscsilun); + nb_blocks = sector_qemu2lun(nb_sectors, iscsilun); + + if (iscsilun->zeroblock == NULL) { + iscsilun->zeroblock = g_malloc0(iscsilun->block_size); + } + + iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: + if (iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba, + iscsilun->zeroblock, iscsilun->block_size, + nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP), + 0, 0, iscsi_co_generic_cb, &iTask) == NULL) { + return -ENOMEM; + } + + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); + } + + if (iTask.status == SCSI_STATUS_CHECK_CONDITION && + iTask.task->sense.key == SCSI_SENSE_ILLEGAL_REQUEST && + (iTask.task->sense.ascq == SCSI_SENSE_ASCQ_INVALID_OPERATION_CODE || + iTask.task->sense.ascq == SCSI_SENSE_ASCQ_INVALID_FIELD_IN_CDB)) { + /* WRITE SAME is not supported by the target */ + iscsilun->has_write_same = false; + scsi_free_scsi_task(iTask.task); + return -ENOTSUP; + } + + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; + } + + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; + } + + if (iTask.status != SCSI_STATUS_GOOD) { + return -EIO; + } + + return 0; +} + +#endif /* SCSI_SENSE_ASCQ_CAPACITY_DATA_HAS_CHANGED */ + +static void parse_chap(struct iscsi_context *iscsi, const char *target, + Error **errp) { QemuOptsList *list; QemuOpts *opts; @@ -859,37 +879,35 @@ static int parse_chap(struct iscsi_context *iscsi, const char *target) list = qemu_find_opts("iscsi"); if (!list) { - return 0; + return; } opts = qemu_opts_find(list, target); if (opts == NULL) { opts = QTAILQ_FIRST(&list->head); if (!opts) { - return 0; + return; } } user = qemu_opt_get(opts, "user"); if (!user) { - return 0; + return; } password = qemu_opt_get(opts, "password"); if (!password) { - error_report("CHAP username specified but no password was given"); - return -1; + error_setg(errp, "CHAP username specified but no password was given"); + return; } if (iscsi_set_initiator_username_pwd(iscsi, user, password)) { - error_report("Failed to set initiator username and password"); - return -1; + error_setg(errp, "Failed to set initiator username and password"); } - - return 0; } -static void parse_header_digest(struct iscsi_context *iscsi, const char *target) +static void parse_header_digest(struct iscsi_context *iscsi, const char *target, + Error **errp) { QemuOptsList *list; QemuOpts *opts; @@ -922,7 +940,7 @@ static void parse_header_digest(struct iscsi_context *iscsi, const char *target) } else if (!strcmp(digest, "NONE-CRC32C")) { iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C); } else { - error_report("Invalid header-digest setting : %s", digest); + error_setg(errp, "Invalid header-digest setting : %s", digest); } } @@ -930,8 +948,9 @@ static char *parse_initiator_name(const char *target) { QemuOptsList *list; QemuOpts *opts; - const char *name = NULL; - const char *iscsi_name = qemu_get_vm_name(); + const char *name; + char *iscsi_name; + UuidInfo *uuid_info; list = qemu_find_opts("iscsi"); if (list) { @@ -941,16 +960,22 @@ static char *parse_initiator_name(const char *target) } if (opts) { name = qemu_opt_get(opts, "initiator-name"); + if (name) { + return g_strdup(name); + } } } - if (name) { - return g_strdup(name); + uuid_info = qmp_query_uuid(NULL); + if (strcmp(uuid_info->UUID, UUID_NONE) == 0) { + name = qemu_get_vm_name(); } else { - return g_strdup_printf("iqn.2008-11.org.linux-kvm%s%s", - iscsi_name ? ":" : "", - iscsi_name ? iscsi_name : ""); + name = uuid_info->UUID; } + iscsi_name = g_strdup_printf("iqn.2008-11.org.linux-kvm%s%s", + name ? ":" : "", name ? name : ""); + qapi_free_UuidInfo(uuid_info); + return iscsi_name; } #if defined(LIBISCSI_FEATURE_NOP_COUNTER) @@ -968,17 +993,16 @@ static void iscsi_nop_timed_event(void *opaque) return; } - qemu_mod_timer(iscsilun->nop_timer, qemu_get_clock_ms(rt_clock) + NOP_INTERVAL); + timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); iscsi_set_events(iscsilun); } #endif -static int iscsi_readcapacity_sync(IscsiLun *iscsilun) +static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) { struct scsi_task *task = NULL; struct scsi_readcapacity10 *rc10 = NULL; struct scsi_readcapacity16 *rc16 = NULL; - int ret = 0; int retries = ISCSI_CMD_RETRIES; do { @@ -993,11 +1017,12 @@ static int iscsi_readcapacity_sync(IscsiLun *iscsilun) if (task != NULL && task->status == SCSI_STATUS_GOOD) { rc16 = scsi_datain_unmarshall(task); if (rc16 == NULL) { - error_report("iSCSI: Failed to unmarshall readcapacity16 data."); - ret = -EINVAL; + error_setg(errp, "iSCSI: Failed to unmarshall readcapacity16 data."); } else { iscsilun->block_size = rc16->block_length; iscsilun->num_blocks = rc16->returned_lba + 1; + iscsilun->lbpme = rc16->lbpme; + iscsilun->lbprz = rc16->lbprz; } } break; @@ -1006,8 +1031,7 @@ static int iscsi_readcapacity_sync(IscsiLun *iscsilun) if (task != NULL && task->status == SCSI_STATUS_GOOD) { rc10 = scsi_datain_unmarshall(task); if (rc10 == NULL) { - error_report("iSCSI: Failed to unmarshall readcapacity10 data."); - ret = -EINVAL; + error_setg(errp, "iSCSI: Failed to unmarshall readcapacity10 data."); } else { iscsilun->block_size = rc10->block_size; if (rc10->lba == 0) { @@ -1020,20 +1044,18 @@ static int iscsi_readcapacity_sync(IscsiLun *iscsilun) } break; default: - return 0; + return; } } while (task != NULL && task->status == SCSI_STATUS_CHECK_CONDITION && task->sense.key == SCSI_SENSE_UNIT_ATTENTION && retries-- > 0); if (task == NULL || task->status != SCSI_STATUS_GOOD) { - error_report("iSCSI: failed to send readcapacity10 command."); - ret = -EINVAL; + error_setg(errp, "iSCSI: failed to send readcapacity10 command."); } if (task) { scsi_free_scsi_task(task); } - return ret; } /* TODO Convert to fine grained options */ @@ -1050,45 +1072,87 @@ static QemuOptsList runtime_opts = { }, }; +static struct scsi_task *iscsi_do_inquiry(struct iscsi_context *iscsi, int lun, + int evpd, int pc, void **inq, Error **errp) +{ + int full_size; + struct scsi_task *task = NULL; + task = iscsi_inquiry_sync(iscsi, lun, evpd, pc, 64); + if (task == NULL || task->status != SCSI_STATUS_GOOD) { + goto fail; + } + full_size = scsi_datain_getfullsize(task); + if (full_size > task->datain.size) { + scsi_free_scsi_task(task); + + /* we need more data for the full list */ + task = iscsi_inquiry_sync(iscsi, lun, evpd, pc, full_size); + if (task == NULL || task->status != SCSI_STATUS_GOOD) { + goto fail; + } + } + + *inq = scsi_datain_unmarshall(task); + if (*inq == NULL) { + error_setg(errp, "iSCSI: failed to unmarshall inquiry datain blob"); + goto fail_with_err; + } + + return task; + +fail: + error_setg(errp, "iSCSI: Inquiry command failed : %s", + iscsi_get_error(iscsi)); +fail_with_err: + if (task != NULL) { + scsi_free_scsi_task(task); + } + return NULL; +} + /* * We support iscsi url's on the form * iscsi://[%@][:]// + * + * Note: flags are currently not used by iscsi_open. If this function + * is changed such that flags are used, please examine iscsi_reopen_prepare() + * to see if needs to be changed as well. */ -static int iscsi_open(BlockDriverState *bs, QDict *options, int flags) +static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { IscsiLun *iscsilun = bs->opaque; struct iscsi_context *iscsi = NULL; struct iscsi_url *iscsi_url = NULL; struct scsi_task *task = NULL; struct scsi_inquiry_standard *inq = NULL; + struct scsi_inquiry_supported_pages *inq_vpd; char *initiator_name = NULL; QemuOpts *opts; Error *local_err = NULL; const char *filename; - int ret; + int i, ret; if ((BDRV_SECTOR_SIZE % 512) != 0) { - error_report("iSCSI: Invalid BDRV_SECTOR_SIZE. " - "BDRV_SECTOR_SIZE(%lld) is not a multiple " - "of 512", BDRV_SECTOR_SIZE); + error_setg(errp, "iSCSI: Invalid BDRV_SECTOR_SIZE. " + "BDRV_SECTOR_SIZE(%lld) is not a multiple " + "of 512", BDRV_SECTOR_SIZE); return -EINVAL; } - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto out; } filename = qemu_opt_get(opts, "filename"); - iscsi_url = iscsi_parse_full_url(iscsi, filename); if (iscsi_url == NULL) { - error_report("Failed to parse URL : %s", filename); + error_setg(errp, "Failed to parse URL : %s", filename); ret = -EINVAL; goto out; } @@ -1099,13 +1163,13 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags) iscsi = iscsi_create_context(initiator_name); if (iscsi == NULL) { - error_report("iSCSI: Failed to create iSCSI context."); + error_setg(errp, "iSCSI: Failed to create iSCSI context."); ret = -ENOMEM; goto out; } if (iscsi_set_targetname(iscsi, iscsi_url->target)) { - error_report("iSCSI: Failed to set target name."); + error_setg(errp, "iSCSI: Failed to set target name."); ret = -EINVAL; goto out; } @@ -1114,21 +1178,22 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags) ret = iscsi_set_initiator_username_pwd(iscsi, iscsi_url->user, iscsi_url->passwd); if (ret != 0) { - error_report("Failed to set initiator username and password"); + error_setg(errp, "Failed to set initiator username and password"); ret = -EINVAL; goto out; } } /* check if we got CHAP username/password via the options */ - if (parse_chap(iscsi, iscsi_url->target) != 0) { - error_report("iSCSI: Failed to set CHAP user/password"); + parse_chap(iscsi, iscsi_url->target, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); ret = -EINVAL; goto out; } if (iscsi_set_session_type(iscsi, ISCSI_SESSION_NORMAL) != 0) { - error_report("iSCSI: Failed to set session type to normal."); + error_setg(errp, "iSCSI: Failed to set session type to normal."); ret = -EINVAL; goto out; } @@ -1136,10 +1201,15 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags) iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C); /* check if we got HEADER_DIGEST via the options */ - parse_header_digest(iscsi, iscsi_url->target); + parse_header_digest(iscsi, iscsi_url->target, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto out; + } if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, iscsi_url->lun) != 0) { - error_report("iSCSI: Failed to connect to LUN : %s", + error_setg(errp, "iSCSI: Failed to connect to LUN : %s", iscsi_get_error(iscsi)); ret = -EINVAL; goto out; @@ -1147,42 +1217,82 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags) iscsilun->iscsi = iscsi; iscsilun->lun = iscsi_url->lun; + iscsilun->has_write_same = true; - task = iscsi_inquiry_sync(iscsi, iscsilun->lun, 0, 0, 36); - - if (task == NULL || task->status != SCSI_STATUS_GOOD) { - error_report("iSCSI: failed to send inquiry command."); - ret = -EINVAL; - goto out; - } - - inq = scsi_datain_unmarshall(task); - if (inq == NULL) { - error_report("iSCSI: Failed to unmarshall inquiry data."); + task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 0, 0, + (void **) &inq, errp); + if (task == NULL) { ret = -EINVAL; goto out; } - iscsilun->type = inq->periperal_device_type; + scsi_free_scsi_task(task); + task = NULL; - if ((ret = iscsi_readcapacity_sync(iscsilun)) != 0) { + iscsi_readcapacity_sync(iscsilun, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); + ret = -EINVAL; goto out; } bs->total_sectors = sector_lun2qemu(iscsilun->num_blocks, iscsilun); + bs->request_alignment = iscsilun->block_size; - /* Medium changer or tape. We dont have any emulation for this so this must - * be sg ioctl compatible. We force it to be sg, otherwise qemu will try - * to read from the device to guess the image format. + /* We don't have any emulation for devices other than disks and CD-ROMs, so + * this must be sg ioctl compatible. We force it to be sg, otherwise qemu + * will try to read from the device to guess the image format. */ - if (iscsilun->type == TYPE_MEDIUM_CHANGER || - iscsilun->type == TYPE_TAPE) { + if (iscsilun->type != TYPE_DISK && iscsilun->type != TYPE_ROM) { bs->sg = 1; } + task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, + SCSI_INQUIRY_PAGECODE_SUPPORTED_VPD_PAGES, + (void **) &inq_vpd, errp); + if (task == NULL) { + ret = -EINVAL; + goto out; + } + for (i = 0; i < inq_vpd->num_pages; i++) { + struct scsi_task *inq_task; + struct scsi_inquiry_logical_block_provisioning *inq_lbp; + struct scsi_inquiry_block_limits *inq_bl; + switch (inq_vpd->pages[i]) { + case SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING: + inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, + SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING, + (void **) &inq_lbp, errp); + if (inq_task == NULL) { + ret = -EINVAL; + goto out; + } + memcpy(&iscsilun->lbp, inq_lbp, + sizeof(struct scsi_inquiry_logical_block_provisioning)); + scsi_free_scsi_task(inq_task); + break; + case SCSI_INQUIRY_PAGECODE_BLOCK_LIMITS: + inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, + SCSI_INQUIRY_PAGECODE_BLOCK_LIMITS, + (void **) &inq_bl, errp); + if (inq_task == NULL) { + ret = -EINVAL; + goto out; + } + memcpy(&iscsilun->bl, inq_bl, + sizeof(struct scsi_inquiry_block_limits)); + scsi_free_scsi_task(inq_task); + break; + default: + break; + } + } + scsi_free_scsi_task(task); + task = NULL; + #if defined(LIBISCSI_FEATURE_NOP_COUNTER) /* Set up a timer for sending out iSCSI NOPs */ - iscsilun->nop_timer = qemu_new_timer_ms(rt_clock, iscsi_nop_timed_event, iscsilun); - qemu_mod_timer(iscsilun->nop_timer, qemu_get_clock_ms(rt_clock) + NOP_INTERVAL); + iscsilun->nop_timer = timer_new_ms(QEMU_CLOCK_REALTIME, iscsi_nop_timed_event, iscsilun); + timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); #endif out: @@ -1212,25 +1322,66 @@ static void iscsi_close(BlockDriverState *bs) struct iscsi_context *iscsi = iscsilun->iscsi; if (iscsilun->nop_timer) { - qemu_del_timer(iscsilun->nop_timer); - qemu_free_timer(iscsilun->nop_timer); + timer_del(iscsilun->nop_timer); + timer_free(iscsilun->nop_timer); } - qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL); iscsi_destroy_context(iscsi); + g_free(iscsilun->zeroblock); memset(iscsilun, 0, sizeof(IscsiLun)); } +static int iscsi_refresh_limits(BlockDriverState *bs) +{ + IscsiLun *iscsilun = bs->opaque; + + /* We don't actually refresh here, but just return data queried in + * iscsi_open(): iscsi targets don't change their limits. */ + if (iscsilun->lbp.lbpu) { + if (iscsilun->bl.max_unmap < 0xffffffff) { + bs->bl.max_discard = sector_lun2qemu(iscsilun->bl.max_unmap, + iscsilun); + } + bs->bl.discard_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran, + iscsilun); + } + + if (iscsilun->bl.max_ws_len < 0xffffffff) { + bs->bl.max_write_zeroes = sector_lun2qemu(iscsilun->bl.max_ws_len, + iscsilun); + } + if (iscsilun->lbp.lbpws) { + bs->bl.write_zeroes_alignment = sector_lun2qemu(iscsilun->bl.opt_unmap_gran, + iscsilun); + } + bs->bl.opt_transfer_length = sector_lun2qemu(iscsilun->bl.opt_xfer_len, + iscsilun); + return 0; +} + +/* Since iscsi_open() ignores bdrv_flags, there is nothing to do here in + * prepare. Note that this will not re-establish a connection with an iSCSI + * target - it is effectively a NOP. */ +static int iscsi_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + /* NOP */ + return 0; +} + static int iscsi_truncate(BlockDriverState *bs, int64_t offset) { IscsiLun *iscsilun = bs->opaque; - int ret = 0; + Error *local_err = NULL; if (iscsilun->type != TYPE_DISK) { return -ENOTSUP; } - if ((ret = iscsi_readcapacity_sync(iscsilun)) != 0) { - return ret; + iscsi_readcapacity_sync(iscsilun, &local_err); + if (local_err != NULL) { + error_free(local_err); + return -EIO; } if (offset > iscsi_getlength(bs)) { @@ -1240,20 +1391,16 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset) return 0; } -static int iscsi_has_zero_init(BlockDriverState *bs) -{ - return 0; -} - -static int iscsi_create(const char *filename, QEMUOptionParameter *options) +static int iscsi_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int ret = 0; int64_t total_size = 0; - BlockDriverState bs; + BlockDriverState *bs; IscsiLun *iscsilun = NULL; QDict *bs_options; - memset(&bs, 0, sizeof(BlockDriverState)); + bs = bdrv_new("", &error_abort); /* Read out options */ while (options && options->name) { @@ -1263,26 +1410,26 @@ static int iscsi_create(const char *filename, QEMUOptionParameter *options) options++; } - bs.opaque = g_malloc0(sizeof(struct IscsiLun)); - iscsilun = bs.opaque; + bs->opaque = g_malloc0(sizeof(struct IscsiLun)); + iscsilun = bs->opaque; bs_options = qdict_new(); qdict_put(bs_options, "filename", qstring_from_str(filename)); - ret = iscsi_open(&bs, bs_options, 0); + ret = iscsi_open(bs, bs_options, 0, NULL); QDECREF(bs_options); if (ret != 0) { goto out; } if (iscsilun->nop_timer) { - qemu_del_timer(iscsilun->nop_timer); - qemu_free_timer(iscsilun->nop_timer); + timer_del(iscsilun->nop_timer); + timer_free(iscsilun->nop_timer); } if (iscsilun->type != TYPE_DISK) { ret = -ENODEV; goto out; } - if (bs.total_sectors < total_size) { + if (bs->total_sectors < total_size) { ret = -ENOSPC; goto out; } @@ -1292,10 +1439,27 @@ static int iscsi_create(const char *filename, QEMUOptionParameter *options) if (iscsilun->iscsi != NULL) { iscsi_destroy_context(iscsilun->iscsi); } - g_free(bs.opaque); + g_free(bs->opaque); + bs->opaque = NULL; + bdrv_unref(bs); return ret; } +static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + IscsiLun *iscsilun = bs->opaque; + bdi->unallocated_blocks_are_zero = !!iscsilun->lbprz; + bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws; + /* Guess the internal cluster (page) size of the iscsi target by the means + * of opt_unmap_gran. Transfer the unmap granularity only if it has a + * reasonable size for bdi->cluster_size */ + if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 64 * 1024 && + iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { + bdi->cluster_size = iscsilun->bl.opt_unmap_gran * iscsilun->block_size; + } + return 0; +} + static QEMUOptionParameter iscsi_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -1310,20 +1474,28 @@ static BlockDriver bdrv_iscsi = { .protocol_name = "iscsi", .instance_size = sizeof(IscsiLun), + .bdrv_needs_filename = true, .bdrv_file_open = iscsi_open, .bdrv_close = iscsi_close, .bdrv_create = iscsi_create, .create_options = iscsi_create_options, + .bdrv_reopen_prepare = iscsi_reopen_prepare, .bdrv_getlength = iscsi_getlength, + .bdrv_get_info = iscsi_get_info, .bdrv_truncate = iscsi_truncate, + .bdrv_refresh_limits = iscsi_refresh_limits, - .bdrv_aio_readv = iscsi_aio_readv, - .bdrv_aio_writev = iscsi_aio_writev, - .bdrv_aio_flush = iscsi_aio_flush, - - .bdrv_aio_discard = iscsi_aio_discard, - .bdrv_has_zero_init = iscsi_has_zero_init, +#if defined(LIBISCSI_FEATURE_IOVECTOR) + .bdrv_co_get_block_status = iscsi_co_get_block_status, +#endif + .bdrv_co_discard = iscsi_co_discard, +#if defined(SCSI_SENSE_ASCQ_CAPACITY_DATA_HAS_CHANGED) + .bdrv_co_write_zeroes = iscsi_co_write_zeroes, +#endif + .bdrv_co_readv = iscsi_co_readv, + .bdrv_co_writev = iscsi_co_writev, + .bdrv_co_flush_to_disk = iscsi_co_flush, #ifdef __linux__ .bdrv_ioctl = iscsi_ioctl, diff --git a/block/linux-aio.c b/block/linux-aio.c index ee0f8d10c9..53434e2df5 100644 --- a/block/linux-aio.c +++ b/block/linux-aio.c @@ -39,7 +39,6 @@ struct qemu_laiocb { struct qemu_laio_state { io_context_t ctx; EventNotifier e; - int count; }; static inline ssize_t io_event_ret(struct io_event *ev) @@ -55,8 +54,6 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s, { int ret; - s->count--; - ret = laiocb->ret; if (ret != -ECANCELED) { if (ret == laiocb->nbytes) { @@ -101,13 +98,6 @@ static void qemu_laio_completion_cb(EventNotifier *e) } } -static int qemu_laio_flush_cb(EventNotifier *e) -{ - struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e); - - return (s->count > 0) ? 1 : 0; -} - static void laio_cancel(BlockDriverAIOCB *blockacb) { struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb; @@ -177,14 +167,11 @@ BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, goto out_free_aiocb; } io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e)); - s->count++; if (io_submit(s->ctx, 1, &iocbs) < 0) - goto out_dec_count; + goto out_free_aiocb; return &laiocb->common; -out_dec_count: - s->count--; out_free_aiocb: qemu_aio_release(laiocb); return NULL; @@ -203,8 +190,7 @@ void *laio_init(void) goto out_close_efd; } - qemu_aio_set_event_notifier(&s->e, qemu_laio_completion_cb, - qemu_laio_flush_cb); + qemu_aio_set_event_notifier(&s->e, qemu_laio_completion_cb); return s; diff --git a/block/mirror.c b/block/mirror.c index bed4a7eadd..1c38aa8f77 100644 --- a/block/mirror.c +++ b/block/mirror.c @@ -31,7 +31,8 @@ typedef struct MirrorBlockJob { BlockJob common; RateLimit limit; BlockDriverState *target; - MirrorSyncMode mode; + BlockDriverState *base; + bool is_none_mode; BlockdevOnError on_source_error, on_target_error; bool synced; bool should_complete; @@ -39,6 +40,7 @@ typedef struct MirrorBlockJob { int64_t granularity; size_t buf_size; unsigned long *cow_bitmap; + BdrvDirtyBitmap *dirty_bitmap; HBitmapIter hbi; uint8_t *buf; QSIMPLEQ_HEAD(, MirrorBuffer) buf_free; @@ -94,8 +96,16 @@ static void mirror_iteration_done(MirrorOp *op, int ret) bitmap_set(s->cow_bitmap, chunk_num, nb_chunks); } + qemu_iovec_destroy(&op->qiov); g_slice_free(MirrorOp, op); - qemu_coroutine_enter(s->common.co, NULL); + + /* Enter coroutine when it is not sleeping. The coroutine sleeps to + * rate-limit itself. The coroutine will eventually resume since there is + * a sleep timeout so don't wake it early. + */ + if (s->common.busy) { + qemu_coroutine_enter(s->common.co, NULL); + } } static void mirror_write_complete(void *opaque, int ret) @@ -136,18 +146,20 @@ static void mirror_read_complete(void *opaque, int ret) mirror_write_complete, op); } -static void coroutine_fn mirror_iteration(MirrorBlockJob *s) +static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) { BlockDriverState *source = s->common.bs; int nb_sectors, sectors_per_chunk, nb_chunks; int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector; + uint64_t delay_ns; MirrorOp *op; s->sector_num = hbitmap_iter_next(&s->hbi); if (s->sector_num < 0) { - bdrv_dirty_iter_init(source, &s->hbi); + bdrv_dirty_iter_init(source, s->dirty_bitmap, &s->hbi); s->sector_num = hbitmap_iter_next(&s->hbi); - trace_mirror_restart_iter(s, bdrv_get_dirty_count(source)); + trace_mirror_restart_iter(s, + bdrv_get_dirty_count(source, s->dirty_bitmap)); assert(s->sector_num >= 0); } @@ -183,7 +195,7 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s) do { int added_sectors, added_chunks; - if (!bdrv_get_dirty(source, next_sector) || + if (!bdrv_get_dirty(source, s->dirty_bitmap, next_sector) || test_bit(next_chunk, s->in_flight_bitmap)) { assert(nb_sectors > 0); break; @@ -227,7 +239,12 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s) nb_chunks += added_chunks; next_sector += added_sectors; next_chunk += added_chunks; - } while (next_sector < end); + if (!s->synced && s->common.speed) { + delay_ns = ratelimit_calculate_delay(&s->limit, added_sectors); + } else { + delay_ns = 0; + } + } while (delay_ns == 0 && next_sector < end); /* Allocate a MirrorOp that is used as an AIO callback. */ op = g_slice_new(MirrorOp); @@ -249,7 +266,8 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s) /* Advance the HBitmapIter in parallel, so that we do not examine * the same sector twice. */ - if (next_sector > hbitmap_next_sector && bdrv_get_dirty(source, next_sector)) { + if (next_sector > hbitmap_next_sector + && bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) { hbitmap_next_sector = hbitmap_iter_next(&s->hbi); } @@ -263,6 +281,7 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s) trace_mirror_one_iteration(s, sector_num, nb_sectors); bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, mirror_read_complete, op); + return delay_ns; } static void mirror_free_init(MirrorBlockJob *s) @@ -306,11 +325,11 @@ static void coroutine_fn mirror_run(void *opaque) s->common.len = bdrv_getlength(bs); if (s->common.len <= 0) { - block_job_completed(&s->common, s->common.len); - return; + ret = s->common.len; + goto immediate_exit; } - length = (bdrv_getlength(bs) + s->granularity - 1) / s->granularity; + length = DIV_ROUND_UP(s->common.len, s->granularity); s->in_flight_bitmap = bitmap_new(length); /* If we have no backing file yet in the destination, we cannot let @@ -320,7 +339,10 @@ static void coroutine_fn mirror_run(void *opaque) bdrv_get_backing_filename(s->target, backing_filename, sizeof(backing_filename)); if (backing_filename[0] && !s->target->backing_hd) { - bdrv_get_info(s->target, &bdi); + ret = bdrv_get_info(s->target, &bdi); + if (ret < 0) { + goto immediate_exit; + } if (s->granularity < bdi.cluster_size) { s->buf_size = MAX(s->buf_size, bdi.cluster_size); s->cow_bitmap = bitmap_new(length); @@ -332,14 +354,13 @@ static void coroutine_fn mirror_run(void *opaque) sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; mirror_free_init(s); - if (s->mode != MIRROR_SYNC_MODE_NONE) { + if (!s->is_none_mode) { /* First part, loop on the sectors and initialize the dirty bitmap. */ - BlockDriverState *base; - base = s->mode == MIRROR_SYNC_MODE_FULL ? NULL : bs->backing_hd; + BlockDriverState *base = s->base; for (sector_num = 0; sector_num < end; ) { int64_t next = (sector_num | (sectors_per_chunk - 1)) + 1; - ret = bdrv_co_is_allocated_above(bs, base, - sector_num, next - sector_num, &n); + ret = bdrv_is_allocated_above(bs, base, + sector_num, next - sector_num, &n); if (ret < 0) { goto immediate_exit; @@ -355,10 +376,10 @@ static void coroutine_fn mirror_run(void *opaque) } } - bdrv_dirty_iter_init(bs, &s->hbi); - last_pause_ns = qemu_get_clock_ns(rt_clock); + bdrv_dirty_iter_init(bs, s->dirty_bitmap, &s->hbi); + last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); for (;;) { - uint64_t delay_ns; + uint64_t delay_ns = 0; int64_t cnt; bool should_complete; @@ -367,14 +388,14 @@ static void coroutine_fn mirror_run(void *opaque) goto immediate_exit; } - cnt = bdrv_get_dirty_count(bs); + cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); /* Note that even when no rate limit is applied we need to yield * periodically with no pending I/O so that qemu_aio_flush() returns. * We do so every SLICE_TIME nanoseconds, or when there is an error, * or when the source is clean, whichever comes first. */ - if (qemu_get_clock_ns(rt_clock) - last_pause_ns < SLICE_TIME && + if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns < SLICE_TIME && s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) { if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 || (cnt == 0 && s->in_flight > 0)) { @@ -382,8 +403,10 @@ static void coroutine_fn mirror_run(void *opaque) qemu_coroutine_yield(); continue; } else if (cnt != 0) { - mirror_iteration(s); - continue; + delay_ns = mirror_iteration(s); + if (delay_ns == 0) { + continue; + } } } @@ -409,7 +432,7 @@ static void coroutine_fn mirror_run(void *opaque) should_complete = s->should_complete || block_job_is_cancelled(&s->common); - cnt = bdrv_get_dirty_count(bs); + cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); } } @@ -424,28 +447,21 @@ static void coroutine_fn mirror_run(void *opaque) */ trace_mirror_before_drain(s, cnt); bdrv_drain_all(); - cnt = bdrv_get_dirty_count(bs); + cnt = bdrv_get_dirty_count(bs, s->dirty_bitmap); } ret = 0; - trace_mirror_before_sleep(s, cnt, s->synced); + trace_mirror_before_sleep(s, cnt, s->synced, delay_ns); if (!s->synced) { /* Publish progress */ s->common.offset = (end - cnt) * BDRV_SECTOR_SIZE; - - if (s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, sectors_per_chunk); - } else { - delay_ns = 0; - } - - block_job_sleep_ns(&s->common, rt_clock, delay_ns); + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); if (block_job_is_cancelled(&s->common)) { break; } } else if (!should_complete) { delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0); - block_job_sleep_ns(&s->common, rt_clock, delay_ns); + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); } else if (cnt == 0) { /* The two disks are in sync. Exit and report successful * completion. @@ -454,7 +470,7 @@ static void coroutine_fn mirror_run(void *opaque) s->common.cancelled = false; break; } - last_pause_ns = qemu_get_clock_ns(rt_clock); + last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); } immediate_exit: @@ -471,16 +487,22 @@ static void coroutine_fn mirror_run(void *opaque) qemu_vfree(s->buf); g_free(s->cow_bitmap); g_free(s->in_flight_bitmap); - bdrv_set_dirty_tracking(bs, 0); + bdrv_release_dirty_bitmap(bs, s->dirty_bitmap); bdrv_iostatus_disable(s->target); if (s->should_complete && ret == 0) { if (bdrv_get_flags(s->target) != bdrv_get_flags(s->common.bs)) { bdrv_reopen(s->target, bdrv_get_flags(s->common.bs), NULL); } bdrv_swap(s->target, s->common.bs); + if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) { + /* drop the bs loop chain formed by the swap: break the loop then + * trigger the unref from the top one */ + BlockDriverState *p = s->base->backing_hd; + s->base->backing_hd = NULL; + bdrv_unref(p); + } } - bdrv_close(s->target); - bdrv_delete(s->target); + bdrv_unref(s->target); block_job_completed(&s->common, ret); } @@ -505,14 +527,12 @@ static void mirror_iostatus_reset(BlockJob *job) static void mirror_complete(BlockJob *job, Error **errp) { MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + Error *local_err = NULL; int ret; - ret = bdrv_open_backing_file(s->target, NULL); + ret = bdrv_open_backing_file(s->target, NULL, &local_err); if (ret < 0) { - char backing_filename[PATH_MAX]; - bdrv_get_full_backing_filename(s->target, backing_filename, - sizeof(backing_filename)); - error_setg_file_open(errp, -ret, backing_filename); + error_propagate(errp, local_err); return; } if (!s->synced) { @@ -524,20 +544,32 @@ static void mirror_complete(BlockJob *job, Error **errp) block_job_resume(job); } -static const BlockJobType mirror_job_type = { +static const BlockJobDriver mirror_job_driver = { .instance_size = sizeof(MirrorBlockJob), - .job_type = "mirror", + .job_type = BLOCK_JOB_TYPE_MIRROR, .set_speed = mirror_set_speed, .iostatus_reset= mirror_iostatus_reset, .complete = mirror_complete, }; -void mirror_start(BlockDriverState *bs, BlockDriverState *target, - int64_t speed, int64_t granularity, int64_t buf_size, - MirrorSyncMode mode, BlockdevOnError on_source_error, - BlockdevOnError on_target_error, - BlockDriverCompletionFunc *cb, - void *opaque, Error **errp) +static const BlockJobDriver commit_active_job_driver = { + .instance_size = sizeof(MirrorBlockJob), + .job_type = BLOCK_JOB_TYPE_COMMIT, + .set_speed = mirror_set_speed, + .iostatus_reset + = mirror_iostatus_reset, + .complete = mirror_complete, +}; + +static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, + int64_t speed, int64_t granularity, + int64_t buf_size, + BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + BlockDriverCompletionFunc *cb, + void *opaque, Error **errp, + const BlockJobDriver *driver, + bool is_none_mode, BlockDriverState *base) { MirrorBlockJob *s; @@ -562,7 +594,8 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, return; } - s = block_job_create(&mirror_job_type, bs, speed, cb, opaque, errp); + + s = block_job_create(driver, bs, speed, cb, opaque, errp); if (!s) { return; } @@ -570,11 +603,15 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, s->on_source_error = on_source_error; s->on_target_error = on_target_error; s->target = target; - s->mode = mode; + s->is_none_mode = is_none_mode; + s->base = base; s->granularity = granularity; s->buf_size = MAX(buf_size, granularity); - bdrv_set_dirty_tracking(bs, granularity); + s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, errp); + if (!s->dirty_bitmap) { + return; + } bdrv_set_enable_write_cache(s->target, true); bdrv_set_on_error(s->target, on_target_error, on_target_error); bdrv_iostatus_enable(s->target); @@ -582,3 +619,80 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, trace_mirror_start(bs, s, s->common.co, opaque); qemu_coroutine_enter(s->common.co, s); } + +void mirror_start(BlockDriverState *bs, BlockDriverState *target, + int64_t speed, int64_t granularity, int64_t buf_size, + MirrorSyncMode mode, BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + BlockDriverCompletionFunc *cb, + void *opaque, Error **errp) +{ + bool is_none_mode; + BlockDriverState *base; + + is_none_mode = mode == MIRROR_SYNC_MODE_NONE; + base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL; + mirror_start_job(bs, target, speed, granularity, buf_size, + on_source_error, on_target_error, cb, opaque, errp, + &mirror_job_driver, is_none_mode, base); +} + +void commit_active_start(BlockDriverState *bs, BlockDriverState *base, + int64_t speed, + BlockdevOnError on_error, + BlockDriverCompletionFunc *cb, + void *opaque, Error **errp) +{ + int64_t length, base_length; + int orig_base_flags; + int ret; + Error *local_err = NULL; + + orig_base_flags = bdrv_get_flags(base); + + if (bdrv_reopen(base, bs->open_flags, errp)) { + return; + } + + length = bdrv_getlength(bs); + if (length < 0) { + error_setg_errno(errp, -length, + "Unable to determine length of %s", bs->filename); + goto error_restore_flags; + } + + base_length = bdrv_getlength(base); + if (base_length < 0) { + error_setg_errno(errp, -base_length, + "Unable to determine length of %s", base->filename); + goto error_restore_flags; + } + + if (length > base_length) { + ret = bdrv_truncate(base, length); + if (ret < 0) { + error_setg_errno(errp, -ret, + "Top image %s is larger than base image %s, and " + "resize of base image failed", + bs->filename, base->filename); + goto error_restore_flags; + } + } + + bdrv_ref(base); + mirror_start_job(bs, base, speed, 0, 0, + on_error, on_error, cb, opaque, &local_err, + &commit_active_job_driver, false, base); + if (local_err) { + error_propagate(errp, local_err); + goto error_restore_flags; + } + + return; + +error_restore_flags: + /* ignore error and errp for bdrv_reopen, because we want to propagate + * the original error */ + bdrv_reopen(base, orig_base_flags, NULL); + return; +} diff --git a/block/nbd-client.c b/block/nbd-client.c new file mode 100644 index 0000000000..7d698cb619 --- /dev/null +++ b/block/nbd-client.c @@ -0,0 +1,388 @@ +/* + * QEMU Block driver for NBD + * + * Copyright (C) 2008 Bull S.A.S. + * Author: Laurent Vivier + * + * Some parts: + * Copyright (C) 2007 Anthony Liguori + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "nbd-client.h" +#include "qemu/sockets.h" + +#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs)) +#define INDEX_TO_HANDLE(bs, index) ((index) ^ ((uint64_t)(intptr_t)bs)) + +static void nbd_recv_coroutines_enter_all(NbdClientSession *s) +{ + int i; + + for (i = 0; i < MAX_NBD_REQUESTS; i++) { + if (s->recv_coroutine[i]) { + qemu_coroutine_enter(s->recv_coroutine[i], NULL); + } + } +} + +static void nbd_teardown_connection(NbdClientSession *client) +{ + /* finish any pending coroutines */ + shutdown(client->sock, 2); + nbd_recv_coroutines_enter_all(client); + + qemu_aio_set_fd_handler(client->sock, NULL, NULL, NULL); + closesocket(client->sock); + client->sock = -1; +} + +static void nbd_reply_ready(void *opaque) +{ + NbdClientSession *s = opaque; + uint64_t i; + int ret; + + if (s->reply.handle == 0) { + /* No reply already in flight. Fetch a header. It is possible + * that another thread has done the same thing in parallel, so + * the socket is not readable anymore. + */ + ret = nbd_receive_reply(s->sock, &s->reply); + if (ret == -EAGAIN) { + return; + } + if (ret < 0) { + s->reply.handle = 0; + goto fail; + } + } + + /* There's no need for a mutex on the receive side, because the + * handler acts as a synchronization point and ensures that only + * one coroutine is called until the reply finishes. */ + i = HANDLE_TO_INDEX(s, s->reply.handle); + if (i >= MAX_NBD_REQUESTS) { + goto fail; + } + + if (s->recv_coroutine[i]) { + qemu_coroutine_enter(s->recv_coroutine[i], NULL); + return; + } + +fail: + nbd_teardown_connection(s); +} + +static void nbd_restart_write(void *opaque) +{ + NbdClientSession *s = opaque; + + qemu_coroutine_enter(s->send_coroutine, NULL); +} + +static int nbd_co_send_request(NbdClientSession *s, + struct nbd_request *request, + QEMUIOVector *qiov, int offset) +{ + int rc, ret; + + qemu_co_mutex_lock(&s->send_mutex); + s->send_coroutine = qemu_coroutine_self(); + qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write, s); + if (qiov) { + if (!s->is_unix) { + socket_set_cork(s->sock, 1); + } + rc = nbd_send_request(s->sock, request); + if (rc >= 0) { + ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov, + offset, request->len); + if (ret != request->len) { + rc = -EIO; + } + } + if (!s->is_unix) { + socket_set_cork(s->sock, 0); + } + } else { + rc = nbd_send_request(s->sock, request); + } + qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL, s); + s->send_coroutine = NULL; + qemu_co_mutex_unlock(&s->send_mutex); + return rc; +} + +static void nbd_co_receive_reply(NbdClientSession *s, + struct nbd_request *request, struct nbd_reply *reply, + QEMUIOVector *qiov, int offset) +{ + int ret; + + /* Wait until we're woken up by the read handler. TODO: perhaps + * peek at the next reply and avoid yielding if it's ours? */ + qemu_coroutine_yield(); + *reply = s->reply; + if (reply->handle != request->handle) { + reply->error = EIO; + } else { + if (qiov && reply->error == 0) { + ret = qemu_co_recvv(s->sock, qiov->iov, qiov->niov, + offset, request->len); + if (ret != request->len) { + reply->error = EIO; + } + } + + /* Tell the read handler to read another header. */ + s->reply.handle = 0; + } +} + +static void nbd_coroutine_start(NbdClientSession *s, + struct nbd_request *request) +{ + int i; + + /* Poor man semaphore. The free_sema is locked when no other request + * can be accepted, and unlocked after receiving one reply. */ + if (s->in_flight >= MAX_NBD_REQUESTS - 1) { + qemu_co_mutex_lock(&s->free_sema); + assert(s->in_flight < MAX_NBD_REQUESTS); + } + s->in_flight++; + + for (i = 0; i < MAX_NBD_REQUESTS; i++) { + if (s->recv_coroutine[i] == NULL) { + s->recv_coroutine[i] = qemu_coroutine_self(); + break; + } + } + + assert(i < MAX_NBD_REQUESTS); + request->handle = INDEX_TO_HANDLE(s, i); +} + +static void nbd_coroutine_end(NbdClientSession *s, + struct nbd_request *request) +{ + int i = HANDLE_TO_INDEX(s, request->handle); + s->recv_coroutine[i] = NULL; + if (s->in_flight-- == MAX_NBD_REQUESTS) { + qemu_co_mutex_unlock(&s->free_sema); + } +} + +static int nbd_co_readv_1(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov, + int offset) +{ + struct nbd_request request = { .type = NBD_CMD_READ }; + struct nbd_reply reply; + ssize_t ret; + + request.from = sector_num * 512; + request.len = nb_sectors * 512; + + nbd_coroutine_start(client, &request); + ret = nbd_co_send_request(client, &request, NULL, 0); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(client, &request, &reply, qiov, offset); + } + nbd_coroutine_end(client, &request); + return -reply.error; + +} + +static int nbd_co_writev_1(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov, + int offset) +{ + struct nbd_request request = { .type = NBD_CMD_WRITE }; + struct nbd_reply reply; + ssize_t ret; + + if (!bdrv_enable_write_cache(client->bs) && + (client->nbdflags & NBD_FLAG_SEND_FUA)) { + request.type |= NBD_CMD_FLAG_FUA; + } + + request.from = sector_num * 512; + request.len = nb_sectors * 512; + + nbd_coroutine_start(client, &request); + ret = nbd_co_send_request(client, &request, qiov, offset); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(client, &request, &reply, NULL, 0); + } + nbd_coroutine_end(client, &request); + return -reply.error; +} + +/* qemu-nbd has a limit of slightly less than 1M per request. Try to + * remain aligned to 4K. */ +#define NBD_MAX_SECTORS 2040 + +int nbd_client_session_co_readv(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + int offset = 0; + int ret; + while (nb_sectors > NBD_MAX_SECTORS) { + ret = nbd_co_readv_1(client, sector_num, + NBD_MAX_SECTORS, qiov, offset); + if (ret < 0) { + return ret; + } + offset += NBD_MAX_SECTORS * 512; + sector_num += NBD_MAX_SECTORS; + nb_sectors -= NBD_MAX_SECTORS; + } + return nbd_co_readv_1(client, sector_num, nb_sectors, qiov, offset); +} + +int nbd_client_session_co_writev(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + int offset = 0; + int ret; + while (nb_sectors > NBD_MAX_SECTORS) { + ret = nbd_co_writev_1(client, sector_num, + NBD_MAX_SECTORS, qiov, offset); + if (ret < 0) { + return ret; + } + offset += NBD_MAX_SECTORS * 512; + sector_num += NBD_MAX_SECTORS; + nb_sectors -= NBD_MAX_SECTORS; + } + return nbd_co_writev_1(client, sector_num, nb_sectors, qiov, offset); +} + +int nbd_client_session_co_flush(NbdClientSession *client) +{ + struct nbd_request request = { .type = NBD_CMD_FLUSH }; + struct nbd_reply reply; + ssize_t ret; + + if (!(client->nbdflags & NBD_FLAG_SEND_FLUSH)) { + return 0; + } + + if (client->nbdflags & NBD_FLAG_SEND_FUA) { + request.type |= NBD_CMD_FLAG_FUA; + } + + request.from = 0; + request.len = 0; + + nbd_coroutine_start(client, &request); + ret = nbd_co_send_request(client, &request, NULL, 0); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(client, &request, &reply, NULL, 0); + } + nbd_coroutine_end(client, &request); + return -reply.error; +} + +int nbd_client_session_co_discard(NbdClientSession *client, int64_t sector_num, + int nb_sectors) +{ + struct nbd_request request = { .type = NBD_CMD_TRIM }; + struct nbd_reply reply; + ssize_t ret; + + if (!(client->nbdflags & NBD_FLAG_SEND_TRIM)) { + return 0; + } + request.from = sector_num * 512; + request.len = nb_sectors * 512; + + nbd_coroutine_start(client, &request); + ret = nbd_co_send_request(client, &request, NULL, 0); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(client, &request, &reply, NULL, 0); + } + nbd_coroutine_end(client, &request); + return -reply.error; + +} + +void nbd_client_session_close(NbdClientSession *client) +{ + struct nbd_request request = { + .type = NBD_CMD_DISC, + .from = 0, + .len = 0 + }; + + if (!client->bs) { + return; + } + if (client->sock == -1) { + return; + } + + nbd_send_request(client->sock, &request); + + nbd_teardown_connection(client); + client->bs = NULL; +} + +int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs, + int sock, const char *export) +{ + int ret; + + /* NBD handshake */ + logout("session init %s\n", export); + qemu_set_block(sock); + ret = nbd_receive_negotiate(sock, export, + &client->nbdflags, &client->size, + &client->blocksize); + if (ret < 0) { + logout("Failed to negotiate with the NBD server\n"); + closesocket(sock); + return ret; + } + + qemu_co_mutex_init(&client->send_mutex); + qemu_co_mutex_init(&client->free_sema); + client->bs = bs; + client->sock = sock; + + /* Now that we're connected, set the socket to be non-blocking and + * kick the reply mechanism. */ + qemu_set_nonblock(sock); + qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL, client); + + logout("Established connection with NBD server\n"); + return 0; +} diff --git a/block/nbd-client.h b/block/nbd-client.h new file mode 100644 index 0000000000..f2a63378bb --- /dev/null +++ b/block/nbd-client.h @@ -0,0 +1,50 @@ +#ifndef NBD_CLIENT_H +#define NBD_CLIENT_H + +#include "qemu-common.h" +#include "block/nbd.h" +#include "block/block_int.h" + +/* #define DEBUG_NBD */ + +#if defined(DEBUG_NBD) +#define logout(fmt, ...) \ + fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__) +#else +#define logout(fmt, ...) ((void)0) +#endif + +#define MAX_NBD_REQUESTS 16 + +typedef struct NbdClientSession { + int sock; + uint32_t nbdflags; + off_t size; + size_t blocksize; + + CoMutex send_mutex; + CoMutex free_sema; + Coroutine *send_coroutine; + int in_flight; + + Coroutine *recv_coroutine[MAX_NBD_REQUESTS]; + struct nbd_reply reply; + + bool is_unix; + + BlockDriverState *bs; +} NbdClientSession; + +int nbd_client_session_init(NbdClientSession *client, BlockDriverState *bs, + int sock, const char *export_name); +void nbd_client_session_close(NbdClientSession *client); + +int nbd_client_session_co_discard(NbdClientSession *client, int64_t sector_num, + int nb_sectors); +int nbd_client_session_co_flush(NbdClientSession *client); +int nbd_client_session_co_writev(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov); +int nbd_client_session_co_readv(NbdClientSession *client, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov); + +#endif /* NBD_CLIENT_H */ diff --git a/block/nbd.c b/block/nbd.c index 9c480b8f26..613f2581ae 100644 --- a/block/nbd.c +++ b/block/nbd.c @@ -26,8 +26,7 @@ * THE SOFTWARE. */ -#include "qemu-common.h" -#include "block/nbd.h" +#include "block/nbd-client.h" #include "qemu/uri.h" #include "block/block_int.h" #include "qemu/module.h" @@ -40,37 +39,9 @@ #define EN_OPTSTR ":exportname=" -/* #define DEBUG_NBD */ - -#if defined(DEBUG_NBD) -#define logout(fmt, ...) \ - fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__) -#else -#define logout(fmt, ...) ((void)0) -#endif - -#define MAX_NBD_REQUESTS 16 -#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs)) -#define INDEX_TO_HANDLE(bs, index) ((index) ^ ((uint64_t)(intptr_t)bs)) - typedef struct BDRVNBDState { - int sock; - uint32_t nbdflags; - off_t size; - size_t blocksize; - - CoMutex send_mutex; - CoMutex free_sema; - Coroutine *send_coroutine; - int in_flight; - - Coroutine *recv_coroutine[MAX_NBD_REQUESTS]; - struct nbd_reply reply; - - bool is_unix; + NbdClientSession client; QemuOpts *socket_opts; - - char *export_name; /* An NBD server may export several devices */ } BDRVNBDState; static int nbd_parse_uri(const char *filename, QDict *options) @@ -204,7 +175,7 @@ static void nbd_parse_filename(const char *filename, QDict *options, InetSocketAddress *addr = NULL; addr = inet_parse(host_spec, errp); - if (error_is_set(errp)) { + if (!addr) { goto out; } @@ -217,204 +188,49 @@ static void nbd_parse_filename(const char *filename, QDict *options, g_free(file); } -static int nbd_config(BDRVNBDState *s, QDict *options) +static void nbd_config(BDRVNBDState *s, QDict *options, char **export, + Error **errp) { Error *local_err = NULL; - if (qdict_haskey(options, "path")) { - if (qdict_haskey(options, "host")) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, "path and host may not " - "be used at the same time."); - return -EINVAL; + if (qdict_haskey(options, "path") == qdict_haskey(options, "host")) { + if (qdict_haskey(options, "path")) { + error_setg(errp, "path and host may not be used at the same time."); + } else { + error_setg(errp, "one of path and host must be specified."); } - s->is_unix = true; - } else if (qdict_haskey(options, "host")) { - s->is_unix = false; - } else { - return -EINVAL; + return; } - s->socket_opts = qemu_opts_create_nofail(&socket_optslist); + s->client.is_unix = qdict_haskey(options, "path"); + s->socket_opts = qemu_opts_create(&socket_optslist, NULL, 0, + &error_abort); qemu_opts_absorb_qdict(s->socket_opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); - return -EINVAL; + if (local_err) { + error_propagate(errp, local_err); + return; } if (!qemu_opt_get(s->socket_opts, "port")) { qemu_opt_set_number(s->socket_opts, "port", NBD_DEFAULT_PORT); } - s->export_name = g_strdup(qdict_get_try_str(options, "export")); - if (s->export_name) { + *export = g_strdup(qdict_get_try_str(options, "export")); + if (*export) { qdict_del(options, "export"); } - - return 0; -} - - -static void nbd_coroutine_start(BDRVNBDState *s, struct nbd_request *request) -{ - int i; - - /* Poor man semaphore. The free_sema is locked when no other request - * can be accepted, and unlocked after receiving one reply. */ - if (s->in_flight >= MAX_NBD_REQUESTS - 1) { - qemu_co_mutex_lock(&s->free_sema); - assert(s->in_flight < MAX_NBD_REQUESTS); - } - s->in_flight++; - - for (i = 0; i < MAX_NBD_REQUESTS; i++) { - if (s->recv_coroutine[i] == NULL) { - s->recv_coroutine[i] = qemu_coroutine_self(); - break; - } - } - - assert(i < MAX_NBD_REQUESTS); - request->handle = INDEX_TO_HANDLE(s, i); -} - -static int nbd_have_request(void *opaque) -{ - BDRVNBDState *s = opaque; - - return s->in_flight > 0; -} - -static void nbd_reply_ready(void *opaque) -{ - BDRVNBDState *s = opaque; - uint64_t i; - int ret; - - if (s->reply.handle == 0) { - /* No reply already in flight. Fetch a header. It is possible - * that another thread has done the same thing in parallel, so - * the socket is not readable anymore. - */ - ret = nbd_receive_reply(s->sock, &s->reply); - if (ret == -EAGAIN) { - return; - } - if (ret < 0) { - s->reply.handle = 0; - goto fail; - } - } - - /* There's no need for a mutex on the receive side, because the - * handler acts as a synchronization point and ensures that only - * one coroutine is called until the reply finishes. */ - i = HANDLE_TO_INDEX(s, s->reply.handle); - if (i >= MAX_NBD_REQUESTS) { - goto fail; - } - - if (s->recv_coroutine[i]) { - qemu_coroutine_enter(s->recv_coroutine[i], NULL); - return; - } - -fail: - for (i = 0; i < MAX_NBD_REQUESTS; i++) { - if (s->recv_coroutine[i]) { - qemu_coroutine_enter(s->recv_coroutine[i], NULL); - } - } -} - -static void nbd_restart_write(void *opaque) -{ - BDRVNBDState *s = opaque; - qemu_coroutine_enter(s->send_coroutine, NULL); -} - -static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request, - QEMUIOVector *qiov, int offset) -{ - int rc, ret; - - qemu_co_mutex_lock(&s->send_mutex); - s->send_coroutine = qemu_coroutine_self(); - qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write, - nbd_have_request, s); - if (qiov) { - if (!s->is_unix) { - socket_set_cork(s->sock, 1); - } - rc = nbd_send_request(s->sock, request); - if (rc >= 0) { - ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov, - offset, request->len); - if (ret != request->len) { - rc = -EIO; - } - } - if (!s->is_unix) { - socket_set_cork(s->sock, 0); - } - } else { - rc = nbd_send_request(s->sock, request); - } - qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL, - nbd_have_request, s); - s->send_coroutine = NULL; - qemu_co_mutex_unlock(&s->send_mutex); - return rc; -} - -static void nbd_co_receive_reply(BDRVNBDState *s, struct nbd_request *request, - struct nbd_reply *reply, - QEMUIOVector *qiov, int offset) -{ - int ret; - - /* Wait until we're woken up by the read handler. TODO: perhaps - * peek at the next reply and avoid yielding if it's ours? */ - qemu_coroutine_yield(); - *reply = s->reply; - if (reply->handle != request->handle) { - reply->error = EIO; - } else { - if (qiov && reply->error == 0) { - ret = qemu_co_recvv(s->sock, qiov->iov, qiov->niov, - offset, request->len); - if (ret != request->len) { - reply->error = EIO; - } - } - - /* Tell the read handler to read another header. */ - s->reply.handle = 0; - } -} - -static void nbd_coroutine_end(BDRVNBDState *s, struct nbd_request *request) -{ - int i = HANDLE_TO_INDEX(s, request->handle); - s->recv_coroutine[i] = NULL; - if (s->in_flight-- == MAX_NBD_REQUESTS) { - qemu_co_mutex_unlock(&s->free_sema); - } } -static int nbd_establish_connection(BlockDriverState *bs) +static int nbd_establish_connection(BlockDriverState *bs, Error **errp) { BDRVNBDState *s = bs->opaque; int sock; - int ret; - off_t size; - size_t blocksize; - if (s->is_unix) { - sock = unix_socket_outgoing(qemu_opt_get(s->socket_opts, "path")); + if (s->client.is_unix) { + sock = unix_connect_opts(s->socket_opts, errp, NULL, NULL); } else { - sock = tcp_socket_outgoing_opts(s->socket_opts); + sock = inet_connect_opts(s->socket_opts, errp, NULL, NULL); if (sock >= 0) { socket_set_nodelay(sock); } @@ -426,226 +242,85 @@ static int nbd_establish_connection(BlockDriverState *bs) return -errno; } - /* NBD handshake */ - ret = nbd_receive_negotiate(sock, s->export_name, &s->nbdflags, &size, - &blocksize); - if (ret < 0) { - logout("Failed to negotiate with the NBD server\n"); - closesocket(sock); - return ret; - } - - /* Now that we're connected, set the socket to be non-blocking and - * kick the reply mechanism. */ - qemu_set_nonblock(sock); - qemu_aio_set_fd_handler(sock, nbd_reply_ready, NULL, - nbd_have_request, s); - - s->sock = sock; - s->size = size; - s->blocksize = blocksize; - - logout("Established connection with NBD server\n"); - return 0; -} - -static void nbd_teardown_connection(BlockDriverState *bs) -{ - BDRVNBDState *s = bs->opaque; - struct nbd_request request; - - request.type = NBD_CMD_DISC; - request.from = 0; - request.len = 0; - nbd_send_request(s->sock, &request); - - qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL); - closesocket(s->sock); + return sock; } -static int nbd_open(BlockDriverState *bs, QDict *options, int flags) +static int nbd_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVNBDState *s = bs->opaque; - int result; - - qemu_co_mutex_init(&s->send_mutex); - qemu_co_mutex_init(&s->free_sema); + char *export = NULL; + int result, sock; + Error *local_err = NULL; /* Pop the config into our state object. Exit if invalid. */ - result = nbd_config(s, options); - if (result != 0) { - return result; + nbd_config(s, options, &export, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return -EINVAL; } /* establish TCP connection, return error if it fails * TODO: Configurable retry-until-timeout behaviour. */ - result = nbd_establish_connection(bs); - - return result; -} - -static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, - int offset) -{ - BDRVNBDState *s = bs->opaque; - struct nbd_request request; - struct nbd_reply reply; - ssize_t ret; - - request.type = NBD_CMD_READ; - request.from = sector_num * 512; - request.len = nb_sectors * 512; - - nbd_coroutine_start(s, &request); - ret = nbd_co_send_request(s, &request, NULL, 0); - if (ret < 0) { - reply.error = -ret; - } else { - nbd_co_receive_reply(s, &request, &reply, qiov, offset); - } - nbd_coroutine_end(s, &request); - return -reply.error; - -} - -static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, - int offset) -{ - BDRVNBDState *s = bs->opaque; - struct nbd_request request; - struct nbd_reply reply; - ssize_t ret; - - request.type = NBD_CMD_WRITE; - if (!bdrv_enable_write_cache(bs) && (s->nbdflags & NBD_FLAG_SEND_FUA)) { - request.type |= NBD_CMD_FLAG_FUA; + sock = nbd_establish_connection(bs, errp); + if (sock < 0) { + return sock; } - request.from = sector_num * 512; - request.len = nb_sectors * 512; - - nbd_coroutine_start(s, &request); - ret = nbd_co_send_request(s, &request, qiov, offset); - if (ret < 0) { - reply.error = -ret; - } else { - nbd_co_receive_reply(s, &request, &reply, NULL, 0); - } - nbd_coroutine_end(s, &request); - return -reply.error; + /* NBD handshake */ + result = nbd_client_session_init(&s->client, bs, sock, export); + g_free(export); + return result; } -/* qemu-nbd has a limit of slightly less than 1M per request. Try to - * remain aligned to 4K. */ -#define NBD_MAX_SECTORS 2040 - static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - int offset = 0; - int ret; - while (nb_sectors > NBD_MAX_SECTORS) { - ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); - if (ret < 0) { - return ret; - } - offset += NBD_MAX_SECTORS * 512; - sector_num += NBD_MAX_SECTORS; - nb_sectors -= NBD_MAX_SECTORS; - } - return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset); + BDRVNBDState *s = bs->opaque; + + return nbd_client_session_co_readv(&s->client, sector_num, + nb_sectors, qiov); } static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - int offset = 0; - int ret; - while (nb_sectors > NBD_MAX_SECTORS) { - ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); - if (ret < 0) { - return ret; - } - offset += NBD_MAX_SECTORS * 512; - sector_num += NBD_MAX_SECTORS; - nb_sectors -= NBD_MAX_SECTORS; - } - return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset); + BDRVNBDState *s = bs->opaque; + + return nbd_client_session_co_writev(&s->client, sector_num, + nb_sectors, qiov); } static int nbd_co_flush(BlockDriverState *bs) { BDRVNBDState *s = bs->opaque; - struct nbd_request request; - struct nbd_reply reply; - ssize_t ret; - - if (!(s->nbdflags & NBD_FLAG_SEND_FLUSH)) { - return 0; - } - - request.type = NBD_CMD_FLUSH; - if (s->nbdflags & NBD_FLAG_SEND_FUA) { - request.type |= NBD_CMD_FLAG_FUA; - } - - request.from = 0; - request.len = 0; - nbd_coroutine_start(s, &request); - ret = nbd_co_send_request(s, &request, NULL, 0); - if (ret < 0) { - reply.error = -ret; - } else { - nbd_co_receive_reply(s, &request, &reply, NULL, 0); - } - nbd_coroutine_end(s, &request); - return -reply.error; + return nbd_client_session_co_flush(&s->client); } static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { BDRVNBDState *s = bs->opaque; - struct nbd_request request; - struct nbd_reply reply; - ssize_t ret; - if (!(s->nbdflags & NBD_FLAG_SEND_TRIM)) { - return 0; - } - request.type = NBD_CMD_TRIM; - request.from = sector_num * 512; - request.len = nb_sectors * 512; - - nbd_coroutine_start(s, &request); - ret = nbd_co_send_request(s, &request, NULL, 0); - if (ret < 0) { - reply.error = -ret; - } else { - nbd_co_receive_reply(s, &request, &reply, NULL, 0); - } - nbd_coroutine_end(s, &request); - return -reply.error; + return nbd_client_session_co_discard(&s->client, sector_num, + nb_sectors); } static void nbd_close(BlockDriverState *bs) { BDRVNBDState *s = bs->opaque; - g_free(s->export_name); - qemu_opts_del(s->socket_opts); - nbd_teardown_connection(bs); + qemu_opts_del(s->socket_opts); + nbd_client_session_close(&s->client); } static int64_t nbd_getlength(BlockDriverState *bs) { BDRVNBDState *s = bs->opaque; - return s->size; + return s->client.size; } static BlockDriver bdrv_nbd = { diff --git a/block/nfs.c b/block/nfs.c new file mode 100644 index 0000000000..539bd951df --- /dev/null +++ b/block/nfs.c @@ -0,0 +1,446 @@ +/* + * QEMU Block driver for native access to files on NFS shares + * + * Copyright (c) 2014 Peter Lieven + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "config-host.h" + +#include +#include "qemu-common.h" +#include "qemu/config-file.h" +#include "qemu/error-report.h" +#include "block/block_int.h" +#include "trace.h" +#include "qemu/iov.h" +#include "qemu/uri.h" +#include "sysemu/sysemu.h" +#include + +typedef struct NFSClient { + struct nfs_context *context; + struct nfsfh *fh; + int events; + bool has_zero_init; +} NFSClient; + +typedef struct NFSRPC { + int ret; + int complete; + QEMUIOVector *iov; + struct stat *st; + Coroutine *co; + QEMUBH *bh; +} NFSRPC; + +static void nfs_process_read(void *arg); +static void nfs_process_write(void *arg); + +static void nfs_set_events(NFSClient *client) +{ + int ev = nfs_which_events(client->context); + if (ev != client->events) { + qemu_aio_set_fd_handler(nfs_get_fd(client->context), + (ev & POLLIN) ? nfs_process_read : NULL, + (ev & POLLOUT) ? nfs_process_write : NULL, + client); + + } + client->events = ev; +} + +static void nfs_process_read(void *arg) +{ + NFSClient *client = arg; + nfs_service(client->context, POLLIN); + nfs_set_events(client); +} + +static void nfs_process_write(void *arg) +{ + NFSClient *client = arg; + nfs_service(client->context, POLLOUT); + nfs_set_events(client); +} + +static void nfs_co_init_task(NFSClient *client, NFSRPC *task) +{ + *task = (NFSRPC) { + .co = qemu_coroutine_self(), + }; +} + +static void nfs_co_generic_bh_cb(void *opaque) +{ + NFSRPC *task = opaque; + qemu_bh_delete(task->bh); + qemu_coroutine_enter(task->co, NULL); +} + +static void +nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data, + void *private_data) +{ + NFSRPC *task = private_data; + task->complete = 1; + task->ret = ret; + if (task->ret > 0 && task->iov) { + if (task->ret <= task->iov->size) { + qemu_iovec_from_buf(task->iov, 0, data, task->ret); + } else { + task->ret = -EIO; + } + } + if (task->ret == 0 && task->st) { + memcpy(task->st, data, sizeof(struct stat)); + } + if (task->ret < 0) { + error_report("NFS Error: %s", nfs_get_error(nfs)); + } + if (task->co) { + task->bh = qemu_bh_new(nfs_co_generic_bh_cb, task); + qemu_bh_schedule(task->bh); + } +} + +static int coroutine_fn nfs_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + NFSClient *client = bs->opaque; + NFSRPC task; + + nfs_co_init_task(client, &task); + task.iov = iov; + + if (nfs_pread_async(client->context, client->fh, + sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE, + nfs_co_generic_cb, &task) != 0) { + return -ENOMEM; + } + + while (!task.complete) { + nfs_set_events(client); + qemu_coroutine_yield(); + } + + if (task.ret < 0) { + return task.ret; + } + + /* zero pad short reads */ + if (task.ret < iov->size) { + qemu_iovec_memset(iov, task.ret, 0, iov->size - task.ret); + } + + return 0; +} + +static int coroutine_fn nfs_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + NFSClient *client = bs->opaque; + NFSRPC task; + char *buf = NULL; + + nfs_co_init_task(client, &task); + + buf = g_malloc(nb_sectors * BDRV_SECTOR_SIZE); + qemu_iovec_to_buf(iov, 0, buf, nb_sectors * BDRV_SECTOR_SIZE); + + if (nfs_pwrite_async(client->context, client->fh, + sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE, + buf, nfs_co_generic_cb, &task) != 0) { + g_free(buf); + return -ENOMEM; + } + + while (!task.complete) { + nfs_set_events(client); + qemu_coroutine_yield(); + } + + g_free(buf); + + if (task.ret != nb_sectors * BDRV_SECTOR_SIZE) { + return task.ret < 0 ? task.ret : -EIO; + } + + return 0; +} + +static int coroutine_fn nfs_co_flush(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + NFSRPC task; + + nfs_co_init_task(client, &task); + + if (nfs_fsync_async(client->context, client->fh, nfs_co_generic_cb, + &task) != 0) { + return -ENOMEM; + } + + while (!task.complete) { + nfs_set_events(client); + qemu_coroutine_yield(); + } + + return task.ret; +} + +/* TODO Convert to fine grained options */ +static QemuOptsList runtime_opts = { + .name = "nfs", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "URL to the NFS file", + }, + { /* end of list */ } + }, +}; + +static void nfs_client_close(NFSClient *client) +{ + if (client->context) { + if (client->fh) { + nfs_close(client->context, client->fh); + } + qemu_aio_set_fd_handler(nfs_get_fd(client->context), NULL, NULL, NULL); + nfs_destroy_context(client->context); + } + memset(client, 0, sizeof(NFSClient)); +} + +static void nfs_file_close(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + nfs_client_close(client); +} + +static int64_t nfs_client_open(NFSClient *client, const char *filename, + int flags, Error **errp) +{ + int ret = -EINVAL, i; + struct stat st; + URI *uri; + QueryParams *qp = NULL; + char *file = NULL, *strp = NULL; + + uri = uri_parse(filename); + if (!uri) { + error_setg(errp, "Invalid URL specified"); + goto fail; + } + if (!uri->server) { + error_setg(errp, "Invalid URL specified"); + goto fail; + } + strp = strrchr(uri->path, '/'); + if (strp == NULL) { + error_setg(errp, "Invalid URL specified"); + goto fail; + } + file = g_strdup(strp); + *strp = 0; + + client->context = nfs_init_context(); + if (client->context == NULL) { + error_setg(errp, "Failed to init NFS context"); + goto fail; + } + + qp = query_params_parse(uri->query); + for (i = 0; i < qp->n; i++) { + if (!qp->p[i].value) { + error_setg(errp, "Value for NFS parameter expected: %s", + qp->p[i].name); + goto fail; + } + if (!strncmp(qp->p[i].name, "uid", 3)) { + nfs_set_uid(client->context, atoi(qp->p[i].value)); + } else if (!strncmp(qp->p[i].name, "gid", 3)) { + nfs_set_gid(client->context, atoi(qp->p[i].value)); + } else if (!strncmp(qp->p[i].name, "tcp-syncnt", 10)) { + nfs_set_tcp_syncnt(client->context, atoi(qp->p[i].value)); + } else { + error_setg(errp, "Unknown NFS parameter name: %s", + qp->p[i].name); + goto fail; + } + } + + ret = nfs_mount(client->context, uri->server, uri->path); + if (ret < 0) { + error_setg(errp, "Failed to mount nfs share: %s", + nfs_get_error(client->context)); + goto fail; + } + + if (flags & O_CREAT) { + ret = nfs_creat(client->context, file, 0600, &client->fh); + if (ret < 0) { + error_setg(errp, "Failed to create file: %s", + nfs_get_error(client->context)); + goto fail; + } + } else { + ret = nfs_open(client->context, file, flags, &client->fh); + if (ret < 0) { + error_setg(errp, "Failed to open file : %s", + nfs_get_error(client->context)); + goto fail; + } + } + + ret = nfs_fstat(client->context, client->fh, &st); + if (ret < 0) { + error_setg(errp, "Failed to fstat file: %s", + nfs_get_error(client->context)); + goto fail; + } + + ret = DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE); + client->has_zero_init = S_ISREG(st.st_mode); + goto out; +fail: + nfs_client_close(client); +out: + if (qp) { + query_params_free(qp); + } + uri_free(uri); + g_free(file); + return ret; +} + +static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { + NFSClient *client = bs->opaque; + int64_t ret; + QemuOpts *opts; + Error *local_err = NULL; + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return -EINVAL; + } + ret = nfs_client_open(client, qemu_opt_get(opts, "filename"), + (flags & BDRV_O_RDWR) ? O_RDWR : O_RDONLY, + errp); + if (ret < 0) { + return ret; + } + bs->total_sectors = ret; + return 0; +} + +static int nfs_file_create(const char *url, QEMUOptionParameter *options, + Error **errp) +{ + int ret = 0; + int64_t total_size = 0; + NFSClient *client = g_malloc0(sizeof(NFSClient)); + + /* Read out options */ + while (options && options->name) { + if (!strcmp(options->name, "size")) { + total_size = options->value.n; + } + options++; + } + + ret = nfs_client_open(client, url, O_CREAT, errp); + if (ret < 0) { + goto out; + } + ret = nfs_ftruncate(client->context, client->fh, total_size); + nfs_client_close(client); +out: + g_free(client); + return ret; +} + +static int nfs_has_zero_init(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + return client->has_zero_init; +} + +static int64_t nfs_get_allocated_file_size(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + NFSRPC task = {0}; + struct stat st; + + task.st = &st; + if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb, + &task) != 0) { + return -ENOMEM; + } + + while (!task.complete) { + nfs_set_events(client); + qemu_aio_wait(); + } + + return (task.ret < 0 ? task.ret : st.st_blocks * st.st_blksize); +} + +static int nfs_file_truncate(BlockDriverState *bs, int64_t offset) +{ + NFSClient *client = bs->opaque; + return nfs_ftruncate(client->context, client->fh, offset); +} + +static BlockDriver bdrv_nfs = { + .format_name = "nfs", + .protocol_name = "nfs", + + .instance_size = sizeof(NFSClient), + .bdrv_needs_filename = true, + .bdrv_has_zero_init = nfs_has_zero_init, + .bdrv_get_allocated_file_size = nfs_get_allocated_file_size, + .bdrv_truncate = nfs_file_truncate, + + .bdrv_file_open = nfs_file_open, + .bdrv_close = nfs_file_close, + .bdrv_create = nfs_file_create, + + .bdrv_co_readv = nfs_co_readv, + .bdrv_co_writev = nfs_co_writev, + .bdrv_co_flush_to_disk = nfs_co_flush, +}; + +static void nfs_block_init(void) +{ + bdrv_register(&bdrv_nfs); +} + +block_init(nfs_block_init); diff --git a/block/parallels.c b/block/parallels.c index 18b3ac0b28..1a5bd350b3 100644 --- a/block/parallels.c +++ b/block/parallels.c @@ -49,9 +49,9 @@ typedef struct BDRVParallelsState { CoMutex lock; uint32_t *catalog_bitmap; - int catalog_size; + unsigned int catalog_size; - int tracks; + unsigned int tracks; } BDRVParallelsState; static int parallels_probe(const uint8_t *buf, int buf_size, const char *filename) @@ -68,7 +68,8 @@ static int parallels_probe(const uint8_t *buf, int buf_size, const char *filenam return 0; } -static int parallels_open(BlockDriverState *bs, QDict *options, int flags) +static int parallels_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVParallelsState *s = bs->opaque; int i; @@ -84,15 +85,26 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags) if (memcmp(ph.magic, HEADER_MAGIC, 16) || (le32_to_cpu(ph.version) != HEADER_VERSION)) { - ret = -EMEDIUMTYPE; + error_setg(errp, "Image not in Parallels format"); + ret = -EINVAL; goto fail; } bs->total_sectors = le32_to_cpu(ph.nb_sectors); s->tracks = le32_to_cpu(ph.tracks); + if (s->tracks == 0) { + error_setg(errp, "Invalid image: Zero sectors per track"); + ret = -EINVAL; + goto fail; + } s->catalog_size = le32_to_cpu(ph.catalog_entries); + if (s->catalog_size > INT_MAX / 4) { + error_setg(errp, "Catalog too large"); + ret = -EFBIG; + goto fail; + } s->catalog_bitmap = g_malloc(s->catalog_size * 4); ret = bdrv_pread(bs->file, 64, s->catalog_bitmap, s->catalog_size * 4); diff --git a/block/qapi.c b/block/qapi.c index a4bc4113b7..af114452e0 100644 --- a/block/qapi.c +++ b/block/qapi.c @@ -25,6 +25,63 @@ #include "block/qapi.h" #include "block/block_int.h" #include "qmp-commands.h" +#include "qapi-visit.h" +#include "qapi/qmp-output-visitor.h" +#include "qapi/qmp/types.h" + +BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs) +{ + BlockDeviceInfo *info = g_malloc0(sizeof(*info)); + + info->file = g_strdup(bs->filename); + info->ro = bs->read_only; + info->drv = g_strdup(bs->drv->format_name); + info->encrypted = bs->encrypted; + info->encryption_key_missing = bdrv_key_required(bs); + + if (bs->node_name[0]) { + info->has_node_name = true; + info->node_name = g_strdup(bs->node_name); + } + + if (bs->backing_file[0]) { + info->has_backing_file = true; + info->backing_file = g_strdup(bs->backing_file); + } + + info->backing_file_depth = bdrv_get_backing_file_depth(bs); + + if (bs->io_limits_enabled) { + ThrottleConfig cfg; + throttle_get_config(&bs->throttle_state, &cfg); + info->bps = cfg.buckets[THROTTLE_BPS_TOTAL].avg; + info->bps_rd = cfg.buckets[THROTTLE_BPS_READ].avg; + info->bps_wr = cfg.buckets[THROTTLE_BPS_WRITE].avg; + + info->iops = cfg.buckets[THROTTLE_OPS_TOTAL].avg; + info->iops_rd = cfg.buckets[THROTTLE_OPS_READ].avg; + info->iops_wr = cfg.buckets[THROTTLE_OPS_WRITE].avg; + + info->has_bps_max = cfg.buckets[THROTTLE_BPS_TOTAL].max; + info->bps_max = cfg.buckets[THROTTLE_BPS_TOTAL].max; + info->has_bps_rd_max = cfg.buckets[THROTTLE_BPS_READ].max; + info->bps_rd_max = cfg.buckets[THROTTLE_BPS_READ].max; + info->has_bps_wr_max = cfg.buckets[THROTTLE_BPS_WRITE].max; + info->bps_wr_max = cfg.buckets[THROTTLE_BPS_WRITE].max; + + info->has_iops_max = cfg.buckets[THROTTLE_OPS_TOTAL].max; + info->iops_max = cfg.buckets[THROTTLE_OPS_TOTAL].max; + info->has_iops_rd_max = cfg.buckets[THROTTLE_OPS_READ].max; + info->iops_rd_max = cfg.buckets[THROTTLE_OPS_READ].max; + info->has_iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max; + info->iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max; + + info->has_iops_size = cfg.op_size; + info->iops_size = cfg.op_size; + } + + return info; +} /* * Returns 0 on success, with *p_list either set to describe snapshot @@ -134,6 +191,9 @@ void bdrv_query_image_info(BlockDriverState *bs, info->dirty_flag = bdi.is_dirty; info->has_dirty_flag = true; } + info->format_specific = bdrv_get_specific_info(bs); + info->has_format_specific = info->format_specific != NULL; + backing_filename = bs->backing_file; if (backing_filename[0] != '\0') { info->backing_filename = g_strdup(backing_filename); @@ -198,50 +258,20 @@ void bdrv_query_info(BlockDriverState *bs, info->io_status = bs->iostatus; } - if (bs->dirty_bitmap) { - info->has_dirty = true; - info->dirty = g_malloc0(sizeof(*info->dirty)); - info->dirty->count = bdrv_get_dirty_count(bs) * BDRV_SECTOR_SIZE; - info->dirty->granularity = - ((int64_t) BDRV_SECTOR_SIZE << hbitmap_granularity(bs->dirty_bitmap)); + if (!QLIST_EMPTY(&bs->dirty_bitmaps)) { + info->has_dirty_bitmaps = true; + info->dirty_bitmaps = bdrv_query_dirty_bitmaps(bs); } if (bs->drv) { info->has_inserted = true; - info->inserted = g_malloc0(sizeof(*info->inserted)); - info->inserted->file = g_strdup(bs->filename); - info->inserted->ro = bs->read_only; - info->inserted->drv = g_strdup(bs->drv->format_name); - info->inserted->encrypted = bs->encrypted; - info->inserted->encryption_key_missing = bdrv_key_required(bs); - - if (bs->backing_file[0]) { - info->inserted->has_backing_file = true; - info->inserted->backing_file = g_strdup(bs->backing_file); - } - - info->inserted->backing_file_depth = bdrv_get_backing_file_depth(bs); - - if (bs->io_limits_enabled) { - info->inserted->bps = - bs->io_limits.bps[BLOCK_IO_LIMIT_TOTAL]; - info->inserted->bps_rd = - bs->io_limits.bps[BLOCK_IO_LIMIT_READ]; - info->inserted->bps_wr = - bs->io_limits.bps[BLOCK_IO_LIMIT_WRITE]; - info->inserted->iops = - bs->io_limits.iops[BLOCK_IO_LIMIT_TOTAL]; - info->inserted->iops_rd = - bs->io_limits.iops[BLOCK_IO_LIMIT_READ]; - info->inserted->iops_wr = - bs->io_limits.iops[BLOCK_IO_LIMIT_WRITE]; - } + info->inserted = bdrv_block_device_info(bs); bs0 = bs; p_image_info = &info->inserted->image; while (1) { bdrv_query_image_info(bs0, p_image_info, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { error_propagate(errp, local_err); goto err; } @@ -289,6 +319,11 @@ BlockStats *bdrv_query_stats(const BlockDriverState *bs) s->parent = bdrv_query_stats(bs->file); } + if (bs->backing_hd) { + s->has_backing = true; + s->backing = bdrv_query_stats(bs->backing_hd); + } + return s; } @@ -301,7 +336,7 @@ BlockInfoList *qmp_query_block(Error **errp) while ((bs = bdrv_next(bs))) { BlockInfoList *info = g_malloc0(sizeof(*info)); bdrv_query_info(bs, &info->value, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { error_propagate(errp, local_err); goto err; } @@ -397,6 +432,118 @@ void bdrv_snapshot_dump(fprintf_function func_fprintf, void *f, } } +static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation, + QDict *dict); +static void dump_qlist(fprintf_function func_fprintf, void *f, int indentation, + QList *list); + +static void dump_qobject(fprintf_function func_fprintf, void *f, + int comp_indent, QObject *obj) +{ + switch (qobject_type(obj)) { + case QTYPE_QINT: { + QInt *value = qobject_to_qint(obj); + func_fprintf(f, "%" PRId64, qint_get_int(value)); + break; + } + case QTYPE_QSTRING: { + QString *value = qobject_to_qstring(obj); + func_fprintf(f, "%s", qstring_get_str(value)); + break; + } + case QTYPE_QDICT: { + QDict *value = qobject_to_qdict(obj); + dump_qdict(func_fprintf, f, comp_indent, value); + break; + } + case QTYPE_QLIST: { + QList *value = qobject_to_qlist(obj); + dump_qlist(func_fprintf, f, comp_indent, value); + break; + } + case QTYPE_QFLOAT: { + QFloat *value = qobject_to_qfloat(obj); + func_fprintf(f, "%g", qfloat_get_double(value)); + break; + } + case QTYPE_QBOOL: { + QBool *value = qobject_to_qbool(obj); + func_fprintf(f, "%s", qbool_get_int(value) ? "true" : "false"); + break; + } + case QTYPE_QERROR: { + QString *value = qerror_human((QError *)obj); + func_fprintf(f, "%s", qstring_get_str(value)); + break; + } + case QTYPE_NONE: + break; + case QTYPE_MAX: + default: + abort(); + } +} + +static void dump_qlist(fprintf_function func_fprintf, void *f, int indentation, + QList *list) +{ + const QListEntry *entry; + int i = 0; + + for (entry = qlist_first(list); entry; entry = qlist_next(entry), i++) { + qtype_code type = qobject_type(entry->value); + bool composite = (type == QTYPE_QDICT || type == QTYPE_QLIST); + const char *format = composite ? "%*s[%i]:\n" : "%*s[%i]: "; + + func_fprintf(f, format, indentation * 4, "", i); + dump_qobject(func_fprintf, f, indentation + 1, entry->value); + if (!composite) { + func_fprintf(f, "\n"); + } + } +} + +static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation, + QDict *dict) +{ + const QDictEntry *entry; + + for (entry = qdict_first(dict); entry; entry = qdict_next(dict, entry)) { + qtype_code type = qobject_type(entry->value); + bool composite = (type == QTYPE_QDICT || type == QTYPE_QLIST); + const char *format = composite ? "%*s%s:\n" : "%*s%s: "; + char key[strlen(entry->key) + 1]; + int i; + + /* replace dashes with spaces in key (variable) names */ + for (i = 0; entry->key[i]; i++) { + key[i] = entry->key[i] == '-' ? ' ' : entry->key[i]; + } + key[i] = 0; + + func_fprintf(f, format, indentation * 4, "", key); + dump_qobject(func_fprintf, f, indentation + 1, entry->value); + if (!composite) { + func_fprintf(f, "\n"); + } + } +} + +void bdrv_image_info_specific_dump(fprintf_function func_fprintf, void *f, + ImageInfoSpecific *info_spec) +{ + QmpOutputVisitor *ov = qmp_output_visitor_new(); + QObject *obj, *data; + + visit_type_ImageInfoSpecific(qmp_output_get_visitor(ov), &info_spec, NULL, + &error_abort); + obj = qmp_output_get_qobject(ov); + assert(qobject_type(obj) == QTYPE_QDICT); + data = qdict_get(qobject_to_qdict(obj), "data"); + dump_qobject(func_fprintf, f, 1, data); + qmp_output_visitor_cleanup(ov); +} + void bdrv_image_info_dump(fprintf_function func_fprintf, void *f, ImageInfo *info) { @@ -467,4 +614,9 @@ void bdrv_image_info_dump(fprintf_function func_fprintf, void *f, func_fprintf(f, "\n"); } } + + if (info->has_format_specific) { + func_fprintf(f, "Format specific information:\n"); + bdrv_image_info_specific_dump(func_fprintf, f, info->format_specific); + } } diff --git a/block/qcow.c b/block/qcow.c index 5239bd68f1..937dd6dd1c 100644 --- a/block/qcow.c +++ b/block/qcow.c @@ -92,7 +92,8 @@ static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int qcow_open(BlockDriverState *bs, QDict *options, int flags) +static int qcow_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVQcowState *s = bs->opaque; int len, i, shift, ret; @@ -112,23 +113,27 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags) be64_to_cpus(&header.l1_table_offset); if (header.magic != QCOW_MAGIC) { - ret = -EMEDIUMTYPE; + error_setg(errp, "Image not in qcow format"); + ret = -EINVAL; goto fail; } if (header.version != QCOW_VERSION) { char version[64]; - snprintf(version, sizeof(version), "QCOW version %d", header.version); - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bs->device_name, "qcow", version); + snprintf(version, sizeof(version), "QCOW version %" PRIu32, + header.version); + error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "qcow", version); ret = -ENOTSUP; goto fail; } if (header.size <= 1 || header.cluster_bits < 9) { + error_setg(errp, "invalid value in qcow header"); ret = -EINVAL; goto fail; } if (header.crypt_method > QCOW_CRYPT_AES) { + error_setg(errp, "invalid encryption method in qcow header"); ret = -EINVAL; goto fail; } @@ -395,7 +400,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, return cluster_offset; } -static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { BDRVQcowState *s = bs->opaque; @@ -410,7 +415,14 @@ static int coroutine_fn qcow_co_is_allocated(BlockDriverState *bs, if (n > nb_sectors) n = nb_sectors; *pnum = n; - return (cluster_offset != 0); + if (!cluster_offset) { + return 0; + } + if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->crypt_method) { + return BDRV_BLOCK_DATA; + } + cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | cluster_offset; } static int decompress_buffer(uint8_t *out_buf, int out_buf_size, @@ -651,7 +663,8 @@ static void qcow_close(BlockDriverState *bs) error_free(s->migration_blocker); } -static int qcow_create(const char *filename, QEMUOptionParameter *options) +static int qcow_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int header_size, backing_filename_len, l1_size, shift, i; QCowHeader header; @@ -659,6 +672,7 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) int64_t total_size = 0; const char *backing_file = NULL; int flags = 0; + Error *local_err = NULL; int ret; BlockDriverState *qcow_bs; @@ -674,13 +688,17 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) options++; } - ret = bdrv_create_file(filename, options); + ret = bdrv_create_file(filename, options, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } - ret = bdrv_file_open(&qcow_bs, filename, NULL, BDRV_O_RDWR); + qcow_bs = NULL; + ret = bdrv_open(&qcow_bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } @@ -706,7 +724,7 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) backing_file = NULL; } header.cluster_bits = 9; /* 512 byte cluster to avoid copying - unmodifyed sectors */ + unmodified sectors */ header.l2_bits = 12; /* 32 KB L2 tables */ } else { header.cluster_bits = 12; /* 4 KB clusters */ @@ -751,7 +769,7 @@ static int qcow_create(const char *filename, QEMUOptionParameter *options) g_free(tmp); ret = 0; exit: - bdrv_delete(qcow_bs); + bdrv_unref(qcow_bs); return ret; } @@ -896,7 +914,7 @@ static BlockDriver bdrv_qcow = { .bdrv_co_readv = qcow_co_readv, .bdrv_co_writev = qcow_co_writev, - .bdrv_co_is_allocated = qcow_co_is_allocated, + .bdrv_co_get_block_status = qcow_co_get_block_status, .bdrv_set_key = qcow_set_key, .bdrv_make_empty = qcow_make_empty, diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c index 2f3114ecc2..8ecbb5bc00 100644 --- a/block/qcow2-cache.c +++ b/block/qcow2-cache.c @@ -114,6 +114,21 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) return ret; } + if (c == s->refcount_block_cache) { + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_BLOCK, + c->entries[i].offset, s->cluster_size); + } else if (c == s->l2_table_cache) { + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, + c->entries[i].offset, s->cluster_size); + } else { + ret = qcow2_pre_write_overlap_check(bs, 0, + c->entries[i].offset, s->cluster_size); + } + + if (ret < 0) { + return ret; + } + if (c == s->refcount_block_cache) { BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART); } else if (c == s->l2_table_cache) { @@ -185,6 +200,24 @@ void qcow2_cache_depends_on_flush(Qcow2Cache *c) c->depends_on_flush = true; } +int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c) +{ + int ret, i; + + ret = qcow2_cache_flush(bs, c); + if (ret < 0) { + return ret; + } + + for (i = 0; i < c->size; i++) { + assert(c->entries[i].ref == 0); + c->entries[i].offset = 0; + c->entries[i].cache_hits = 0; + } + + return 0; +} + static int qcow2_cache_find_entry_to_replace(Qcow2Cache *c) { int i; diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c index cca76d4fcd..76d2bcf63a 100644 --- a/block/qcow2-cluster.c +++ b/block/qcow2-cluster.c @@ -35,12 +35,20 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, BDRVQcowState *s = bs->opaque; int new_l1_size2, ret, i; uint64_t *new_l1_table; + int64_t old_l1_table_offset, old_l1_size; int64_t new_l1_table_offset, new_l1_size; uint8_t data[12]; if (min_size <= s->l1_size) return 0; + /* Do a sanity check on min_size before trying to calculate new_l1_size + * (this prevents overflows during the while loop for the calculation of + * new_l1_size) */ + if (min_size > INT_MAX / sizeof(uint64_t)) { + return -EFBIG; + } + if (exact_size) { new_l1_size = min_size; } else { @@ -54,7 +62,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, } } - if (new_l1_size > INT_MAX) { + if (new_l1_size > INT_MAX / sizeof(uint64_t)) { return -EFBIG; } @@ -80,6 +88,14 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, goto fail; } + /* the L1 position has not yet been updated, so these clusters must + * indeed be completely free */ + ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset, + new_l1_size2); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); for(i = 0; i < s->l1_size; i++) new_l1_table[i] = cpu_to_be64(new_l1_table[i]); @@ -92,17 +108,19 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, /* set new table */ BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); cpu_to_be32w((uint32_t*)data, new_l1_size); - cpu_to_be64wu((uint64_t*)(data + 4), new_l1_table_offset); + stq_be_p(data + 4, new_l1_table_offset); ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data,sizeof(data)); if (ret < 0) { goto fail; } g_free(s->l1_table); - qcow2_free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t), - QCOW2_DISCARD_OTHER); + old_l1_table_offset = s->l1_table_offset; s->l1_table_offset = new_l1_table_offset; s->l1_table = new_l1_table; + old_l1_size = s->l1_size; s->l1_size = new_l1_size; + qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t), + QCOW2_DISCARD_OTHER); return 0; fail: g_free(new_l1_table); @@ -137,7 +155,7 @@ static int l2_load(BlockDriverState *bs, uint64_t l2_offset, * and we really don't want bdrv_pread to perform a read-modify-write) */ #define L1_ENTRIES_PER_SECTOR (512 / 8) -static int write_l1_entry(BlockDriverState *bs, int l1_index) +int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index) { BDRVQcowState *s = bs->opaque; uint64_t buf[L1_ENTRIES_PER_SECTOR]; @@ -149,6 +167,12 @@ static int write_l1_entry(BlockDriverState *bs, int l1_index) buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]); } + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1, + s->l1_table_offset + 8 * l1_start_index, sizeof(buf)); + if (ret < 0) { + return ret; + } + BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index, buf, sizeof(buf)); @@ -173,7 +197,7 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) { BDRVQcowState *s = bs->opaque; uint64_t old_l2_offset; - uint64_t *l2_table; + uint64_t *l2_table = NULL; int64_t l2_offset; int ret; @@ -185,7 +209,8 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t)); if (l2_offset < 0) { - return l2_offset; + ret = l2_offset; + goto fail; } ret = qcow2_cache_flush(bs, s->refcount_block_cache); @@ -198,7 +223,7 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) trace_qcow2_l2_allocate_get_empty(bs, l1_index); ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table); if (ret < 0) { - return ret; + goto fail; } l2_table = *table; @@ -239,7 +264,7 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) /* update the L1 entry */ trace_qcow2_l2_allocate_write_l1(bs, l1_index); s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED; - ret = write_l1_entry(bs, l1_index); + ret = qcow2_write_l1_entry(bs, l1_index); if (ret < 0) { goto fail; } @@ -250,8 +275,14 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) fail: trace_qcow2_l2_allocate_done(bs, l1_index, ret); - qcow2_cache_put(bs, s->l2_table_cache, (void**) table); + if (l2_table != NULL) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) table); + } s->l1_table[l1_index] = old_l2_offset; + if (l2_offset > 0) { + qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), + QCOW2_DISCARD_ALWAYS); + } return ret; } @@ -263,23 +294,26 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) * cluster which may require a different handling) */ static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, - uint64_t *l2_table, uint64_t start, uint64_t stop_flags) + uint64_t *l2_table, uint64_t stop_flags) { int i; - uint64_t mask = stop_flags | L2E_OFFSET_MASK; - uint64_t offset = be64_to_cpu(l2_table[0]) & mask; + uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED; + uint64_t first_entry = be64_to_cpu(l2_table[0]); + uint64_t offset = first_entry & mask; if (!offset) return 0; - for (i = start; i < start + nb_clusters; i++) { + assert(qcow2_get_cluster_type(first_entry) != QCOW2_CLUSTER_COMPRESSED); + + for (i = 0; i < nb_clusters; i++) { uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask; if (offset + (uint64_t) i * cluster_size != l2_entry) { break; } } - return (i - start); + return i; } static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table) @@ -332,15 +366,6 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, struct iovec iov; int n, ret; - /* - * If this is the last cluster and it is only partially used, we must only - * copy until the end of the image, or bdrv_check_request will fail for the - * bdrv_read/write calls below. - */ - if (start_sect + n_end > bs->total_sectors) { - n_end = bs->total_sectors - start_sect; - } - n = n_end - n_start; if (n <= 0) { return 0; @@ -353,6 +378,10 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); + if (!bs->drv) { + return -ENOMEDIUM; + } + /* Call .bdrv_co_readv() directly instead of using the public block-layer * interface. This avoids double I/O throttling and request tracking, * which can lead to deadlock when block layer copy-on-read is enabled. @@ -368,6 +397,12 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, &s->aes_encrypt_key); } + ret = qcow2_pre_write_overlap_check(bs, 0, + cluster_offset + n_start * BDRV_SECTOR_SIZE, n * BDRV_SECTOR_SIZE); + if (ret < 0) { + goto out; + } + BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov); if (ret < 0) { @@ -463,11 +498,11 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, break; case QCOW2_CLUSTER_ZERO: if (s->qcow_version < 3) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); return -EIO; } c = count_contiguous_clusters(nb_clusters, s->cluster_size, - &l2_table[l2_index], 0, - QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); + &l2_table[l2_index], QCOW_OFLAG_ZERO); *cluster_offset = 0; break; case QCOW2_CLUSTER_UNALLOCATED: @@ -478,8 +513,7 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, case QCOW2_CLUSTER_NORMAL: /* how many allocated clusters ? */ c = count_contiguous_clusters(nb_clusters, s->cluster_size, - &l2_table[l2_index], 0, - QCOW_OFLAG_COMPRESSED | QCOW_OFLAG_ZERO); + &l2_table[l2_index], QCOW_OFLAG_ZERO); *cluster_offset &= L2E_OFFSET_MASK; break; default: @@ -695,6 +729,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) } qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + assert(l2_index + m->nb_clusters <= s->l2_size); for (i = 0; i < m->nb_clusters; i++) { /* if two concurrent writes happen to the same unallocated cluster * each write allocates separate cluster and writes data concurrently. @@ -908,7 +943,7 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, /* We keep all QCOW_OFLAG_COPIED clusters */ keep_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size, - &l2_table[l2_index], 0, + &l2_table[l2_index], QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO); assert(keep_clusters <= nb_clusters); @@ -1150,7 +1185,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, * Return 0 on success and -errno in error cases */ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, - int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m) + int *num, uint64_t *host_offset, QCowL2Meta **m) { BDRVQcowState *s = bs->opaque; uint64_t start, remaining; @@ -1158,15 +1193,13 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, uint64_t cur_bytes; int ret; - trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, - n_start, n_end); + trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *num); - assert(n_start * BDRV_SECTOR_SIZE == offset_into_cluster(s, offset)); - offset = start_of_cluster(s, offset); + assert((offset & ~BDRV_SECTOR_MASK) == 0); again: - start = offset + (n_start << BDRV_SECTOR_BITS); - remaining = (n_end - n_start) << BDRV_SECTOR_BITS; + start = offset; + remaining = *num << BDRV_SECTOR_BITS; cluster_offset = 0; *host_offset = 0; cur_bytes = 0; @@ -1252,7 +1285,7 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, } } - *num = (n_end - n_start) - (remaining >> BDRV_SECTOR_BITS); + *num -= remaining >> BDRV_SECTOR_BITS; assert(*num > 0); assert(*host_offset != 0); @@ -1317,7 +1350,7 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) * clusters. */ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, - unsigned int nb_clusters) + unsigned int nb_clusters, enum qcow2_discard_type type) { BDRVQcowState *s = bs->opaque; uint64_t *l2_table; @@ -1334,19 +1367,47 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); for (i = 0; i < nb_clusters; i++) { - uint64_t old_offset; + uint64_t old_l2_entry; - old_offset = be64_to_cpu(l2_table[l2_index + i]); - if ((old_offset & L2E_OFFSET_MASK) == 0) { - continue; + old_l2_entry = be64_to_cpu(l2_table[l2_index + i]); + + /* + * Make sure that a discarded area reads back as zeroes for v3 images + * (we cannot do it for v2 without actually writing a zero-filled + * buffer). We can skip the operation if the cluster is already marked + * as zero, or if it's unallocated and we don't have a backing file. + * + * TODO We might want to use bdrv_get_block_status(bs) here, but we're + * holding s->lock, so that doesn't work today. + */ + switch (qcow2_get_cluster_type(old_l2_entry)) { + case QCOW2_CLUSTER_UNALLOCATED: + if (!bs->backing_hd) { + continue; + } + break; + + case QCOW2_CLUSTER_ZERO: + continue; + + case QCOW2_CLUSTER_NORMAL: + case QCOW2_CLUSTER_COMPRESSED: + break; + + default: + abort(); } /* First remove L2 entries */ qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); - l2_table[l2_index + i] = cpu_to_be64(0); + if (s->qcow_version >= 3) { + l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); + } else { + l2_table[l2_index + i] = cpu_to_be64(0); + } /* Then decrease the refcount */ - qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); + qcow2_free_any_clusters(bs, old_l2_entry, 1, type); } ret = qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); @@ -1358,7 +1419,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, } int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, - int nb_sectors) + int nb_sectors, enum qcow2_discard_type type) { BDRVQcowState *s = bs->opaque; uint64_t end_offset; @@ -1369,7 +1430,7 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, /* Round start up and end down */ offset = align_offset(offset, s->cluster_size); - end_offset &= ~(s->cluster_size - 1); + end_offset = start_of_cluster(s, end_offset); if (offset > end_offset) { return 0; @@ -1381,7 +1442,7 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, /* Each L2 table is handled by its own loop iteration */ while (nb_clusters > 0) { - ret = discard_single_l2(bs, offset, nb_clusters); + ret = discard_single_l2(bs, offset, nb_clusters, type); if (ret < 0) { goto fail; } @@ -1476,3 +1537,255 @@ int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors) return ret; } + +/* + * Expands all zero clusters in a specific L1 table (or deallocates them, for + * non-backed non-pre-allocated zero clusters). + * + * expanded_clusters is a bitmap where every bit corresponds to one cluster in + * the image file; a bit gets set if the corresponding cluster has been used for + * zero expansion (i.e., has been filled with zeroes and is referenced from an + * L2 table). nb_clusters contains the total cluster count of the image file, + * i.e., the number of bits in expanded_clusters. + */ +static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, + int l1_size, uint8_t **expanded_clusters, + uint64_t *nb_clusters) +{ + BDRVQcowState *s = bs->opaque; + bool is_active_l1 = (l1_table == s->l1_table); + uint64_t *l2_table = NULL; + int ret; + int i, j; + + if (!is_active_l1) { + /* inactive L2 tables require a buffer to be stored in when loading + * them from disk */ + l2_table = qemu_blockalign(bs, s->cluster_size); + } + + for (i = 0; i < l1_size; i++) { + uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK; + bool l2_dirty = false; + + if (!l2_offset) { + /* unallocated */ + continue; + } + + if (is_active_l1) { + /* get active L2 tables from cache */ + ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, + (void **)&l2_table); + } else { + /* load inactive L2 tables from disk */ + ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE, + (void *)l2_table, s->cluster_sectors); + } + if (ret < 0) { + goto fail; + } + + for (j = 0; j < s->l2_size; j++) { + uint64_t l2_entry = be64_to_cpu(l2_table[j]); + int64_t offset = l2_entry & L2E_OFFSET_MASK, cluster_index; + int cluster_type = qcow2_get_cluster_type(l2_entry); + bool preallocated = offset != 0; + + if (cluster_type == QCOW2_CLUSTER_NORMAL) { + cluster_index = offset >> s->cluster_bits; + assert((cluster_index >= 0) && (cluster_index < *nb_clusters)); + if ((*expanded_clusters)[cluster_index / 8] & + (1 << (cluster_index % 8))) { + /* Probably a shared L2 table; this cluster was a zero + * cluster which has been expanded, its refcount + * therefore most likely requires an update. */ + ret = qcow2_update_cluster_refcount(bs, cluster_index, 1, + QCOW2_DISCARD_NEVER); + if (ret < 0) { + goto fail; + } + /* Since we just increased the refcount, the COPIED flag may + * no longer be set. */ + l2_table[j] = cpu_to_be64(l2_entry & ~QCOW_OFLAG_COPIED); + l2_dirty = true; + } + continue; + } + else if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_ZERO) { + continue; + } + + if (!preallocated) { + if (!bs->backing_hd) { + /* not backed; therefore we can simply deallocate the + * cluster */ + l2_table[j] = 0; + l2_dirty = true; + continue; + } + + offset = qcow2_alloc_clusters(bs, s->cluster_size); + if (offset < 0) { + ret = offset; + goto fail; + } + } + + ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size); + if (ret < 0) { + if (!preallocated) { + qcow2_free_clusters(bs, offset, s->cluster_size, + QCOW2_DISCARD_ALWAYS); + } + goto fail; + } + + ret = bdrv_write_zeroes(bs->file, offset / BDRV_SECTOR_SIZE, + s->cluster_sectors, 0); + if (ret < 0) { + if (!preallocated) { + qcow2_free_clusters(bs, offset, s->cluster_size, + QCOW2_DISCARD_ALWAYS); + } + goto fail; + } + + l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED); + l2_dirty = true; + + cluster_index = offset >> s->cluster_bits; + + if (cluster_index >= *nb_clusters) { + uint64_t old_bitmap_size = (*nb_clusters + 7) / 8; + uint64_t new_bitmap_size; + /* The offset may lie beyond the old end of the underlying image + * file for growable files only */ + assert(bs->file->growable); + *nb_clusters = size_to_clusters(s, bs->file->total_sectors * + BDRV_SECTOR_SIZE); + new_bitmap_size = (*nb_clusters + 7) / 8; + *expanded_clusters = g_realloc(*expanded_clusters, + new_bitmap_size); + /* clear the newly allocated space */ + memset(&(*expanded_clusters)[old_bitmap_size], 0, + new_bitmap_size - old_bitmap_size); + } + + assert((cluster_index >= 0) && (cluster_index < *nb_clusters)); + (*expanded_clusters)[cluster_index / 8] |= 1 << (cluster_index % 8); + } + + if (is_active_l1) { + if (l2_dirty) { + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + qcow2_cache_depends_on_flush(s->l2_table_cache); + } + ret = qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table); + if (ret < 0) { + l2_table = NULL; + goto fail; + } + } else { + if (l2_dirty) { + ret = qcow2_pre_write_overlap_check(bs, + QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset, + s->cluster_size); + if (ret < 0) { + goto fail; + } + + ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE, + (void *)l2_table, s->cluster_sectors); + if (ret < 0) { + goto fail; + } + } + } + } + + ret = 0; + +fail: + if (l2_table) { + if (!is_active_l1) { + qemu_vfree(l2_table); + } else { + if (ret < 0) { + qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table); + } else { + ret = qcow2_cache_put(bs, s->l2_table_cache, + (void **)&l2_table); + } + } + } + return ret; +} + +/* + * For backed images, expands all zero clusters on the image. For non-backed + * images, deallocates all non-pre-allocated zero clusters (and claims the + * allocation for pre-allocated ones). This is important for downgrading to a + * qcow2 version which doesn't yet support metadata zero clusters. + */ +int qcow2_expand_zero_clusters(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l1_table = NULL; + uint64_t nb_clusters; + uint8_t *expanded_clusters; + int ret; + int i, j; + + nb_clusters = size_to_clusters(s, bs->file->total_sectors * + BDRV_SECTOR_SIZE); + expanded_clusters = g_malloc0((nb_clusters + 7) / 8); + + ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size, + &expanded_clusters, &nb_clusters); + if (ret < 0) { + goto fail; + } + + /* Inactive L1 tables may point to active L2 tables - therefore it is + * necessary to flush the L2 table cache before trying to access the L2 + * tables pointed to by inactive L1 entries (else we might try to expand + * zero clusters that have already been expanded); furthermore, it is also + * necessary to empty the L2 table cache, since it may contain tables which + * are now going to be modified directly on disk, bypassing the cache. + * qcow2_cache_empty() does both for us. */ + ret = qcow2_cache_empty(bs, s->l2_table_cache); + if (ret < 0) { + goto fail; + } + + for (i = 0; i < s->nb_snapshots; i++) { + int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) + + BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE; + + l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE); + + ret = bdrv_read(bs->file, s->snapshots[i].l1_table_offset / + BDRV_SECTOR_SIZE, (void *)l1_table, l1_sectors); + if (ret < 0) { + goto fail; + } + + for (j = 0; j < s->snapshots[i].l1_size; j++) { + be64_to_cpus(&l1_table[j]); + } + + ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size, + &expanded_clusters, &nb_clusters); + if (ret < 0) { + goto fail; + } + } + + ret = 0; + +fail: + g_free(expanded_clusters); + g_free(l1_table); + return ret; +} diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c index 1244693f39..9507aef847 100644 --- a/block/qcow2-refcount.c +++ b/block/qcow2-refcount.c @@ -25,8 +25,10 @@ #include "qemu-common.h" #include "block/block_int.h" #include "block/qcow2.h" +#include "qemu/range.h" +#include "qapi/qmp/types.h" -static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size); +static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size); static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, int64_t offset, int64_t length, int addend, enum qcow2_discard_type type); @@ -38,8 +40,10 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, int qcow2_refcount_init(BlockDriverState *bs) { BDRVQcowState *s = bs->opaque; - int ret, refcount_table_size2, i; + unsigned int refcount_table_size2, i; + int ret; + assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t)); refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t); s->refcount_table = g_malloc(refcount_table_size2); if (s->refcount_table_size > 0) { @@ -85,7 +89,7 @@ static int load_refcount_block(BlockDriverState *bs, static int get_refcount(BlockDriverState *bs, int64_t cluster_index) { BDRVQcowState *s = bs->opaque; - int refcount_table_index, block_index; + uint64_t refcount_table_index, block_index; int64_t refcount_block_offset; int ret; uint16_t *refcount_block; @@ -94,7 +98,8 @@ static int get_refcount(BlockDriverState *bs, int64_t cluster_index) refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT); if (refcount_table_index >= s->refcount_table_size) return 0; - refcount_block_offset = s->refcount_table[refcount_table_index]; + refcount_block_offset = + s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK; if (!refcount_block_offset) return 0; @@ -189,10 +194,11 @@ static int alloc_refcount_block(BlockDriverState *bs, * they can describe them themselves. * * - We need to consider that at this point we are inside update_refcounts - * and doing the initial refcount increase. This means that some clusters - * have already been allocated by the caller, but their refcount isn't - * accurate yet. free_cluster_index tells us where this allocation ends - * as long as we don't overwrite it by freeing clusters. + * and potentially doing an initial refcount increase. This means that + * some clusters have already been allocated by the caller, but their + * refcount isn't accurate yet. If we allocate clusters for metadata, we + * need to return -EAGAIN to signal the caller that it needs to restart + * the search for free clusters. * * - alloc_clusters_noref and qcow2_free_clusters may load a different * refcount block into the cache @@ -277,7 +283,10 @@ static int alloc_refcount_block(BlockDriverState *bs, } s->refcount_table[refcount_table_index] = new_block; - return 0; + + /* The new refcount block may be where the caller intended to put its + * data, so let it restart the search. */ + return -EAGAIN; } ret = qcow2_cache_put(bs, s->refcount_block_cache, (void**) refcount_block); @@ -300,8 +309,11 @@ static int alloc_refcount_block(BlockDriverState *bs, /* Calculate the number of refcount blocks needed so far */ uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT); - uint64_t blocks_used = (s->free_cluster_index + - refcount_block_clusters - 1) / refcount_block_clusters; + uint64_t blocks_used = DIV_ROUND_UP(cluster_index, refcount_block_clusters); + + if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) { + return -EFBIG; + } /* And now we need at least one block more for the new metadata */ uint64_t table_size = next_refcount_table_size(s, blocks_used + 1); @@ -334,8 +346,6 @@ static int alloc_refcount_block(BlockDriverState *bs, uint16_t *new_blocks = g_malloc0(blocks_clusters * s->cluster_size); uint64_t *new_table = g_malloc0(table_size * sizeof(uint64_t)); - assert(meta_offset >= (s->free_cluster_index * s->cluster_size)); - /* Fill the new refcount table */ memcpy(new_table, s->refcount_table, s->refcount_table_size * sizeof(uint64_t)); @@ -398,18 +408,19 @@ static int alloc_refcount_block(BlockDriverState *bs, s->refcount_table_size = table_size; s->refcount_table_offset = table_offset; - /* Free old table. Remember, we must not change free_cluster_index */ - uint64_t old_free_cluster_index = s->free_cluster_index; + /* Free old table. */ qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t), QCOW2_DISCARD_OTHER); - s->free_cluster_index = old_free_cluster_index; ret = load_refcount_block(bs, new_block, (void**) refcount_block); if (ret < 0) { return ret; } - return 0; + /* If we were trying to do the initial refcount update for some cluster + * allocation, we might have used the same clusters to store newly + * allocated metadata. Make the caller search some new space. */ + return -EAGAIN; fail_table: g_free(new_table); @@ -513,8 +524,8 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, s->l2_table_cache); } - start = offset & ~(s->cluster_size - 1); - last = (offset + length - 1) & ~(s->cluster_size - 1); + start = start_of_cluster(s, offset); + last = start_of_cluster(s, offset + length - 1); for(cluster_offset = start; cluster_offset <= last; cluster_offset += s->cluster_size) { @@ -599,10 +610,10 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, * If the return value is non-negative, it is the new refcount of the cluster. * If it is negative, it is -errno and indicates an error. */ -static int update_cluster_refcount(BlockDriverState *bs, - int64_t cluster_index, - int addend, - enum qcow2_discard_type type) +int qcow2_update_cluster_refcount(BlockDriverState *bs, + int64_t cluster_index, + int addend, + enum qcow2_discard_type type) { BDRVQcowState *s = bs->opaque; int ret; @@ -624,15 +635,16 @@ static int update_cluster_refcount(BlockDriverState *bs, /* return < 0 if error */ -static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size) +static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size) { BDRVQcowState *s = bs->opaque; - int i, nb_clusters, refcount; + uint64_t i, nb_clusters; + int refcount; nb_clusters = size_to_clusters(s, size); retry: for(i = 0; i < nb_clusters; i++) { - int64_t next_cluster_index = s->free_cluster_index++; + uint64_t next_cluster_index = s->free_cluster_index++; refcount = get_refcount(bs, next_cluster_index); if (refcount < 0) { @@ -641,6 +653,15 @@ static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size) goto retry; } } + + /* Make sure that all offsets in the "allocated" range are representable + * in an int64_t */ + if (s->free_cluster_index > 0 && + s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits)) + { + return -EFBIG; + } + #ifdef DEBUG_ALLOC2 fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n", size, @@ -649,18 +670,21 @@ static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size) return (s->free_cluster_index - nb_clusters) << s->cluster_bits; } -int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size) +int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size) { int64_t offset; int ret; BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); - offset = alloc_clusters_noref(bs, size); - if (offset < 0) { - return offset; - } + do { + offset = alloc_clusters_noref(bs, size); + if (offset < 0) { + return offset; + } + + ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER); + } while (ret == -EAGAIN); - ret = update_refcount(bs, offset, size, 1, QCOW2_DISCARD_NEVER); if (ret < 0) { return ret; } @@ -673,33 +697,36 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, { BDRVQcowState *s = bs->opaque; uint64_t cluster_index; - uint64_t old_free_cluster_index; - int i, refcount, ret; + uint64_t i; + int refcount, ret; - /* Check how many clusters there are free */ - cluster_index = offset >> s->cluster_bits; - for(i = 0; i < nb_clusters; i++) { - refcount = get_refcount(bs, cluster_index++); + assert(nb_clusters >= 0); + if (nb_clusters == 0) { + return 0; + } - if (refcount < 0) { - return refcount; - } else if (refcount != 0) { - break; + do { + /* Check how many clusters there are free */ + cluster_index = offset >> s->cluster_bits; + for(i = 0; i < nb_clusters; i++) { + refcount = get_refcount(bs, cluster_index++); + + if (refcount < 0) { + return refcount; + } else if (refcount != 0) { + break; + } } - } - /* And then allocate them */ - old_free_cluster_index = s->free_cluster_index; - s->free_cluster_index = cluster_index + i; + /* And then allocate them */ + ret = update_refcount(bs, offset, i << s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); + } while (ret == -EAGAIN); - ret = update_refcount(bs, offset, i << s->cluster_bits, 1, - QCOW2_DISCARD_NEVER); if (ret < 0) { return ret; } - s->free_cluster_index = old_free_cluster_index; - return i; } @@ -722,7 +749,7 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) } redo: free_in_cluster = s->cluster_size - - (s->free_byte_offset & (s->cluster_size - 1)); + offset_into_cluster(s, s->free_byte_offset); if (size <= free_in_cluster) { /* enough space in current cluster */ offset = s->free_byte_offset; @@ -730,20 +757,20 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) free_in_cluster -= size; if (free_in_cluster == 0) s->free_byte_offset = 0; - if ((offset & (s->cluster_size - 1)) != 0) - update_cluster_refcount(bs, offset >> s->cluster_bits, 1, - QCOW2_DISCARD_NEVER); + if (offset_into_cluster(s, offset) != 0) + qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); } else { offset = qcow2_alloc_clusters(bs, s->cluster_size); if (offset < 0) { return offset; } - cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1); + cluster_offset = start_of_cluster(s, s->free_byte_offset); if ((cluster_offset + s->cluster_size) == offset) { /* we are lucky: contiguous data */ offset = s->free_byte_offset; - update_cluster_refcount(bs, offset >> s->cluster_bits, 1, - QCOW2_DISCARD_NEVER); + qcow2_update_cluster_refcount(bs, offset >> s->cluster_bits, 1, + QCOW2_DISCARD_NEVER); s->free_byte_offset += size; } else { s->free_byte_offset = offset; @@ -752,8 +779,8 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) } /* The cluster refcount was incremented, either by qcow2_alloc_clusters() - * or explicitly by update_cluster_refcount(). Refcount blocks must be - * flushed before the caller's L2 table updates. + * or explicitly by qcow2_update_cluster_refcount(). Refcount blocks must + * be flushed before the caller's L2 table updates. */ qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache); return offset; @@ -794,11 +821,13 @@ void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, } break; case QCOW2_CLUSTER_NORMAL: - qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, - nb_clusters << s->cluster_bits, type); + case QCOW2_CLUSTER_ZERO: + if (l2_entry & L2E_OFFSET_MASK) { + qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, + nb_clusters << s->cluster_bits, type); + } break; case QCOW2_CLUSTER_UNALLOCATED: - case QCOW2_CLUSTER_ZERO: break; default: abort(); @@ -861,15 +890,17 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, } for(j = 0; j < s->l2_size; j++) { + uint64_t cluster_index; + offset = be64_to_cpu(l2_table[j]); - if (offset != 0) { - old_offset = offset; - offset &= ~QCOW_OFLAG_COPIED; - if (offset & QCOW_OFLAG_COMPRESSED) { + old_offset = offset; + offset &= ~QCOW_OFLAG_COPIED; + + switch (qcow2_get_cluster_type(offset)) { + case QCOW2_CLUSTER_COMPRESSED: nb_csectors = ((offset >> s->csize_shift) & s->csize_mask) + 1; if (addend != 0) { - int ret; ret = update_refcount(bs, (offset & s->cluster_offset_mask) & ~511, nb_csectors * 512, addend, @@ -880,11 +911,20 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, } /* compressed clusters are never modified */ refcount = 2; - } else { - uint64_t cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits; + break; + + case QCOW2_CLUSTER_NORMAL: + case QCOW2_CLUSTER_ZERO: + cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits; + if (!cluster_index) { + /* unallocated */ + refcount = 0; + break; + } if (addend != 0) { - refcount = update_cluster_refcount(bs, cluster_index, addend, - QCOW2_DISCARD_SNAPSHOT); + refcount = qcow2_update_cluster_refcount(bs, + cluster_index, addend, + QCOW2_DISCARD_SNAPSHOT); } else { refcount = get_refcount(bs, cluster_index); } @@ -893,19 +933,26 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, ret = refcount; goto fail; } - } + break; - if (refcount == 1) { - offset |= QCOW_OFLAG_COPIED; - } - if (offset != old_offset) { - if (addend > 0) { - qcow2_cache_set_dependency(bs, s->l2_table_cache, - s->refcount_block_cache); - } - l2_table[j] = cpu_to_be64(offset); - qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); + case QCOW2_CLUSTER_UNALLOCATED: + refcount = 0; + break; + + default: + abort(); + } + + if (refcount == 1) { + offset |= QCOW_OFLAG_COPIED; + } + if (offset != old_offset) { + if (addend > 0) { + qcow2_cache_set_dependency(bs, s->l2_table_cache, + s->refcount_block_cache); } + l2_table[j] = cpu_to_be64(offset); + qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_table); } } @@ -916,8 +963,8 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, if (addend != 0) { - refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend, - QCOW2_DISCARD_SNAPSHOT); + refcount = qcow2_update_cluster_refcount(bs, l2_offset >> + s->cluster_bits, addend, QCOW2_DISCARD_SNAPSHOT); } else { refcount = get_refcount(bs, l2_offset >> s->cluster_bits); } @@ -982,22 +1029,17 @@ static void inc_refcounts(BlockDriverState *bs, int64_t offset, int64_t size) { BDRVQcowState *s = bs->opaque; - int64_t start, last, cluster_offset; - int k; + uint64_t start, last, cluster_offset, k; if (size <= 0) return; - start = offset & ~(s->cluster_size - 1); - last = (offset + size - 1) & ~(s->cluster_size - 1); + start = start_of_cluster(s, offset); + last = start_of_cluster(s, offset + size - 1); for(cluster_offset = start; cluster_offset <= last; cluster_offset += s->cluster_size) { k = cluster_offset >> s->cluster_bits; - if (k < 0) { - fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n", - cluster_offset); - res->corruptions++; - } else if (k >= refcount_table_size) { + if (k >= refcount_table_size) { fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after " "the end of the image file, can't properly check refcounts.\n", cluster_offset); @@ -1014,7 +1056,6 @@ static void inc_refcounts(BlockDriverState *bs, /* Flags for check_refcounts_l1() and check_refcounts_l2() */ enum { - CHECK_OFLAG_COPIED = 0x1, /* check QCOW_OFLAG_COPIED matches refcount */ CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */ }; @@ -1033,7 +1074,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, BDRVQcowState *s = bs->opaque; uint64_t *l2_table, l2_entry; uint64_t next_contiguous_offset = 0; - int i, l2_size, nb_csectors, refcount; + int i, l2_size, nb_csectors; /* Read L2 table from disk */ l2_size = s->l2_size * sizeof(uint64_t); @@ -1085,23 +1126,8 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, case QCOW2_CLUSTER_NORMAL: { - /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ uint64_t offset = l2_entry & L2E_OFFSET_MASK; - if (flags & CHECK_OFLAG_COPIED) { - refcount = get_refcount(bs, offset >> s->cluster_bits); - if (refcount < 0) { - fprintf(stderr, "Can't get refcount for offset %" - PRIx64 ": %s\n", l2_entry, strerror(-refcount)); - goto fail; - } - if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) { - fprintf(stderr, "ERROR OFLAG_COPIED: offset=%" - PRIx64 " refcount=%d\n", l2_entry, refcount); - res->corruptions++; - } - } - if (flags & CHECK_FRAG_INFO) { res->bfi.allocated_clusters++; if (next_contiguous_offset && @@ -1116,7 +1142,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, offset, s->cluster_size); /* Correct offsets are cluster aligned */ - if (offset & (s->cluster_size - 1)) { + if (offset_into_cluster(s, offset)) { fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not " "properly aligned; L2 entry corrupted.\n", offset); res->corruptions++; @@ -1158,7 +1184,7 @@ static int check_refcounts_l1(BlockDriverState *bs, { BDRVQcowState *s = bs->opaque; uint64_t *l1_table, l2_offset, l1_size2; - int i, refcount, ret; + int i, ret; l1_size2 = l1_size * sizeof(uint64_t); @@ -1182,29 +1208,13 @@ static int check_refcounts_l1(BlockDriverState *bs, for(i = 0; i < l1_size; i++) { l2_offset = l1_table[i]; if (l2_offset) { - /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */ - if (flags & CHECK_OFLAG_COPIED) { - refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED) - >> s->cluster_bits); - if (refcount < 0) { - fprintf(stderr, "Can't get refcount for l2_offset %" - PRIx64 ": %s\n", l2_offset, strerror(-refcount)); - goto fail; - } - if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) { - fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64 - " refcount=%d\n", l2_offset, refcount); - res->corruptions++; - } - } - /* Mark L2 table as used */ l2_offset &= L1E_OFFSET_MASK; inc_refcounts(bs, res, refcount_table, refcount_table_size, l2_offset, s->cluster_size); /* L2 tables are cluster aligned */ - if (l2_offset & (s->cluster_size - 1)) { + if (offset_into_cluster(s, l2_offset)) { fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " "cluster aligned; L1 entry corrupted\n", l2_offset); res->corruptions++; @@ -1228,6 +1238,240 @@ static int check_refcounts_l1(BlockDriverState *bs, return -EIO; } +/* + * Checks the OFLAG_COPIED flag for all L1 and L2 entries. + * + * This function does not print an error message nor does it increment + * check_errors if get_refcount fails (this is because such an error will have + * been already detected and sufficiently signaled by the calling function + * (qcow2_check_refcounts) by the time this function is called). + */ +static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix) +{ + BDRVQcowState *s = bs->opaque; + uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size); + int ret; + int refcount; + int i, j; + + for (i = 0; i < s->l1_size; i++) { + uint64_t l1_entry = s->l1_table[i]; + uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK; + bool l2_dirty = false; + + if (!l2_offset) { + continue; + } + + refcount = get_refcount(bs, l2_offset >> s->cluster_bits); + if (refcount < 0) { + /* don't print message nor increment check_errors */ + continue; + } + if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) { + fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d " + "l1_entry=%" PRIx64 " refcount=%d\n", + fix & BDRV_FIX_ERRORS ? "Repairing" : + "ERROR", + i, l1_entry, refcount); + if (fix & BDRV_FIX_ERRORS) { + s->l1_table[i] = refcount == 1 + ? l1_entry | QCOW_OFLAG_COPIED + : l1_entry & ~QCOW_OFLAG_COPIED; + ret = qcow2_write_l1_entry(bs, i); + if (ret < 0) { + res->check_errors++; + goto fail; + } + res->corruptions_fixed++; + } else { + res->corruptions++; + } + } + + ret = bdrv_pread(bs->file, l2_offset, l2_table, + s->l2_size * sizeof(uint64_t)); + if (ret < 0) { + fprintf(stderr, "ERROR: Could not read L2 table: %s\n", + strerror(-ret)); + res->check_errors++; + goto fail; + } + + for (j = 0; j < s->l2_size; j++) { + uint64_t l2_entry = be64_to_cpu(l2_table[j]); + uint64_t data_offset = l2_entry & L2E_OFFSET_MASK; + int cluster_type = qcow2_get_cluster_type(l2_entry); + + if ((cluster_type == QCOW2_CLUSTER_NORMAL) || + ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) { + refcount = get_refcount(bs, data_offset >> s->cluster_bits); + if (refcount < 0) { + /* don't print message nor increment check_errors */ + continue; + } + if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) { + fprintf(stderr, "%s OFLAG_COPIED data cluster: " + "l2_entry=%" PRIx64 " refcount=%d\n", + fix & BDRV_FIX_ERRORS ? "Repairing" : + "ERROR", + l2_entry, refcount); + if (fix & BDRV_FIX_ERRORS) { + l2_table[j] = cpu_to_be64(refcount == 1 + ? l2_entry | QCOW_OFLAG_COPIED + : l2_entry & ~QCOW_OFLAG_COPIED); + l2_dirty = true; + res->corruptions_fixed++; + } else { + res->corruptions++; + } + } + } + } + + if (l2_dirty) { + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, + l2_offset, s->cluster_size); + if (ret < 0) { + fprintf(stderr, "ERROR: Could not write L2 table; metadata " + "overlap check failed: %s\n", strerror(-ret)); + res->check_errors++; + goto fail; + } + + ret = bdrv_pwrite(bs->file, l2_offset, l2_table, s->cluster_size); + if (ret < 0) { + fprintf(stderr, "ERROR: Could not write L2 table: %s\n", + strerror(-ret)); + res->check_errors++; + goto fail; + } + } + } + + ret = 0; + +fail: + qemu_vfree(l2_table); + return ret; +} + +/* + * Writes one sector of the refcount table to the disk + */ +#define RT_ENTRIES_PER_SECTOR (512 / sizeof(uint64_t)) +static int write_reftable_entry(BlockDriverState *bs, int rt_index) +{ + BDRVQcowState *s = bs->opaque; + uint64_t buf[RT_ENTRIES_PER_SECTOR]; + int rt_start_index; + int i, ret; + + rt_start_index = rt_index & ~(RT_ENTRIES_PER_SECTOR - 1); + for (i = 0; i < RT_ENTRIES_PER_SECTOR; i++) { + buf[i] = cpu_to_be64(s->refcount_table[rt_start_index + i]); + } + + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_TABLE, + s->refcount_table_offset + rt_start_index * sizeof(uint64_t), + sizeof(buf)); + if (ret < 0) { + return ret; + } + + BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE); + ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset + + rt_start_index * sizeof(uint64_t), buf, sizeof(buf)); + if (ret < 0) { + return ret; + } + + return 0; +} + +/* + * Allocates a new cluster for the given refcount block (represented by its + * offset in the image file) and copies the current content there. This function + * does _not_ decrement the reference count for the currently occupied cluster. + * + * This function prints an informative message to stderr on error (and returns + * -errno); on success, the offset of the newly allocated cluster is returned. + */ +static int64_t realloc_refcount_block(BlockDriverState *bs, int reftable_index, + uint64_t offset) +{ + BDRVQcowState *s = bs->opaque; + int64_t new_offset = 0; + void *refcount_block = NULL; + int ret; + + /* allocate new refcount block */ + new_offset = qcow2_alloc_clusters(bs, s->cluster_size); + if (new_offset < 0) { + fprintf(stderr, "Could not allocate new cluster: %s\n", + strerror(-new_offset)); + ret = new_offset; + goto done; + } + + /* fetch current refcount block content */ + ret = qcow2_cache_get(bs, s->refcount_block_cache, offset, &refcount_block); + if (ret < 0) { + fprintf(stderr, "Could not fetch refcount block: %s\n", strerror(-ret)); + goto fail_free_cluster; + } + + /* new block has not yet been entered into refcount table, therefore it is + * no refcount block yet (regarding this check) */ + ret = qcow2_pre_write_overlap_check(bs, 0, new_offset, s->cluster_size); + if (ret < 0) { + fprintf(stderr, "Could not write refcount block; metadata overlap " + "check failed: %s\n", strerror(-ret)); + /* the image will be marked corrupt, so don't even attempt on freeing + * the cluster */ + goto done; + } + + /* write to new block */ + ret = bdrv_write(bs->file, new_offset / BDRV_SECTOR_SIZE, refcount_block, + s->cluster_sectors); + if (ret < 0) { + fprintf(stderr, "Could not write refcount block: %s\n", strerror(-ret)); + goto fail_free_cluster; + } + + /* update refcount table */ + assert(!offset_into_cluster(s, new_offset)); + s->refcount_table[reftable_index] = new_offset; + ret = write_reftable_entry(bs, reftable_index); + if (ret < 0) { + fprintf(stderr, "Could not update refcount table: %s\n", + strerror(-ret)); + goto fail_free_cluster; + } + + goto done; + +fail_free_cluster: + qcow2_free_clusters(bs, new_offset, s->cluster_size, QCOW2_DISCARD_OTHER); + +done: + if (refcount_block) { + /* This should never fail, as it would only do so if the given refcount + * block cannot be found in the cache. As this is impossible as long as + * there are no bugs, assert the success. */ + int tmp = qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); + assert(tmp == 0); + } + + if (ret < 0) { + return ret; + } + + return new_offset; +} + /* * Checks an image for refcount consistency. * @@ -1238,14 +1482,24 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) { BDRVQcowState *s = bs->opaque; - int64_t size, i, highest_cluster; - int nb_clusters, refcount1, refcount2; + int64_t size, i, highest_cluster, nb_clusters; + int refcount1, refcount2; QCowSnapshot *sn; uint16_t *refcount_table; int ret; size = bdrv_getlength(bs->file); + if (size < 0) { + res->check_errors++; + return size; + } + nb_clusters = size_to_clusters(s, size); + if (nb_clusters > INT_MAX) { + res->check_errors++; + return -EFBIG; + } + refcount_table = g_malloc0(nb_clusters * sizeof(uint16_t)); res->bfi.total_clusters = @@ -1257,8 +1511,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, /* current L1 table */ ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, - s->l1_table_offset, s->l1_size, - CHECK_OFLAG_COPIED | CHECK_FRAG_INFO); + s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO); if (ret < 0) { goto fail; } @@ -1286,7 +1539,7 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, cluster = offset >> s->cluster_bits; /* Refcount blocks are cluster aligned */ - if (offset & (s->cluster_size - 1)) { + if (offset_into_cluster(s, offset)) { fprintf(stderr, "ERROR refcount block %" PRId64 " is not " "cluster aligned; refcount table entry corrupted\n", i); res->corruptions++; @@ -1304,10 +1557,39 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, inc_refcounts(bs, res, refcount_table, nb_clusters, offset, s->cluster_size); if (refcount_table[cluster] != 1) { - fprintf(stderr, "ERROR refcount block %" PRId64 + fprintf(stderr, "%s refcount block %" PRId64 " refcount=%d\n", + fix & BDRV_FIX_ERRORS ? "Repairing" : + "ERROR", i, refcount_table[cluster]); - res->corruptions++; + + if (fix & BDRV_FIX_ERRORS) { + int64_t new_offset; + + new_offset = realloc_refcount_block(bs, i, offset); + if (new_offset < 0) { + res->corruptions++; + continue; + } + + /* update refcounts */ + if ((new_offset >> s->cluster_bits) >= nb_clusters) { + /* increase refcount_table size if necessary */ + int old_nb_clusters = nb_clusters; + nb_clusters = (new_offset >> s->cluster_bits) + 1; + refcount_table = g_realloc(refcount_table, + nb_clusters * sizeof(uint16_t)); + memset(&refcount_table[old_nb_clusters], 0, (nb_clusters + - old_nb_clusters) * sizeof(uint16_t)); + } + refcount_table[cluster]--; + inc_refcounts(bs, res, refcount_table, nb_clusters, + new_offset, s->cluster_size); + + res->corruptions_fixed++; + } else { + res->corruptions++; + } } } } @@ -1363,6 +1645,12 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, } } + /* check OFLAG_COPIED */ + ret = check_oflag_copied(bs, res, fix); + if (ret < 0) { + goto fail; + } + res->image_end_offset = (highest_cluster + 1) * s->cluster_size; ret = 0; @@ -1372,3 +1660,173 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, return ret; } +#define overlaps_with(ofs, sz) \ + ranges_overlap(offset, size, ofs, sz) + +/* + * Checks if the given offset into the image file is actually free to use by + * looking for overlaps with important metadata sections (L1/L2 tables etc.), + * i.e. a sanity check without relying on the refcount tables. + * + * The ign parameter specifies what checks not to perform (being a bitmask of + * QCow2MetadataOverlap values), i.e., what sections to ignore. + * + * Returns: + * - 0 if writing to this offset will not affect the mentioned metadata + * - a positive QCow2MetadataOverlap value indicating one overlapping section + * - a negative value (-errno) indicating an error while performing a check, + * e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2 + */ +int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, + int64_t size) +{ + BDRVQcowState *s = bs->opaque; + int chk = s->overlap_check & ~ign; + int i, j; + + if (!size) { + return 0; + } + + if (chk & QCOW2_OL_MAIN_HEADER) { + if (offset < s->cluster_size) { + return QCOW2_OL_MAIN_HEADER; + } + } + + /* align range to test to cluster boundaries */ + size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size); + offset = start_of_cluster(s, offset); + + if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) { + if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) { + return QCOW2_OL_ACTIVE_L1; + } + } + + if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) { + if (overlaps_with(s->refcount_table_offset, + s->refcount_table_size * sizeof(uint64_t))) { + return QCOW2_OL_REFCOUNT_TABLE; + } + } + + if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) { + if (overlaps_with(s->snapshots_offset, s->snapshots_size)) { + return QCOW2_OL_SNAPSHOT_TABLE; + } + } + + if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) { + for (i = 0; i < s->nb_snapshots; i++) { + if (s->snapshots[i].l1_size && + overlaps_with(s->snapshots[i].l1_table_offset, + s->snapshots[i].l1_size * sizeof(uint64_t))) { + return QCOW2_OL_INACTIVE_L1; + } + } + } + + if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) { + for (i = 0; i < s->l1_size; i++) { + if ((s->l1_table[i] & L1E_OFFSET_MASK) && + overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK, + s->cluster_size)) { + return QCOW2_OL_ACTIVE_L2; + } + } + } + + if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) { + for (i = 0; i < s->refcount_table_size; i++) { + if ((s->refcount_table[i] & REFT_OFFSET_MASK) && + overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK, + s->cluster_size)) { + return QCOW2_OL_REFCOUNT_BLOCK; + } + } + } + + if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) { + for (i = 0; i < s->nb_snapshots; i++) { + uint64_t l1_ofs = s->snapshots[i].l1_table_offset; + uint32_t l1_sz = s->snapshots[i].l1_size; + uint64_t l1_sz2 = l1_sz * sizeof(uint64_t); + uint64_t *l1 = g_malloc(l1_sz2); + int ret; + + ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2); + if (ret < 0) { + g_free(l1); + return ret; + } + + for (j = 0; j < l1_sz; j++) { + uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK; + if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) { + g_free(l1); + return QCOW2_OL_INACTIVE_L2; + } + } + + g_free(l1); + } + } + + return 0; +} + +static const char *metadata_ol_names[] = { + [QCOW2_OL_MAIN_HEADER_BITNR] = "qcow2_header", + [QCOW2_OL_ACTIVE_L1_BITNR] = "active L1 table", + [QCOW2_OL_ACTIVE_L2_BITNR] = "active L2 table", + [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table", + [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block", + [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table", + [QCOW2_OL_INACTIVE_L1_BITNR] = "inactive L1 table", + [QCOW2_OL_INACTIVE_L2_BITNR] = "inactive L2 table", +}; + +/* + * First performs a check for metadata overlaps (through + * qcow2_check_metadata_overlap); if that fails with a negative value (error + * while performing a check), that value is returned. If an impending overlap + * is detected, the BDS will be made unusable, the qcow2 file marked corrupt + * and -EIO returned. + * + * Returns 0 if there were neither overlaps nor errors while checking for + * overlaps; or a negative value (-errno) on error. + */ +int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, + int64_t size) +{ + int ret = qcow2_check_metadata_overlap(bs, ign, offset, size); + + if (ret < 0) { + return ret; + } else if (ret > 0) { + int metadata_ol_bitnr = ffs(ret) - 1; + char *message; + QObject *data; + + assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR); + + fprintf(stderr, "qcow2: Preventing invalid write on metadata (overlaps " + "with %s); image marked as corrupt.\n", + metadata_ol_names[metadata_ol_bitnr]); + message = g_strdup_printf("Prevented %s overwrite", + metadata_ol_names[metadata_ol_bitnr]); + data = qobject_from_jsonf("{ 'device': %s, 'msg': %s, 'offset': %" + PRId64 ", 'size': %" PRId64 " }", bs->device_name, message, + offset, size); + monitor_protocol_event(QEVENT_BLOCK_IMAGE_CORRUPTED, data); + g_free(message); + qobject_decref(data); + + qcow2_mark_corrupt(bs); + bs->drv = NULL; /* make BDS unusable */ + return -EIO; + } + + return 0; +} diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c index 0caac9055f..0aa9defbe2 100644 --- a/block/qcow2-snapshot.c +++ b/block/qcow2-snapshot.c @@ -26,31 +26,6 @@ #include "block/block_int.h" #include "block/qcow2.h" -typedef struct QEMU_PACKED QCowSnapshotHeader { - /* header is 8 byte aligned */ - uint64_t l1_table_offset; - - uint32_t l1_size; - uint16_t id_str_size; - uint16_t name_size; - - uint32_t date_sec; - uint32_t date_nsec; - - uint64_t vm_clock_nsec; - - uint32_t vm_state_size; - uint32_t extra_data_size; /* for extension */ - /* extra data follows */ - /* id_str follows */ - /* name follows */ -} QCowSnapshotHeader; - -typedef struct QEMU_PACKED QCowSnapshotExtraData { - uint64_t vm_state_size_large; - uint64_t disk_size; -} QCowSnapshotExtraData; - void qcow2_free_snapshots(BlockDriverState *bs) { BDRVQcowState *s = bs->opaque; @@ -141,8 +116,14 @@ int qcow2_read_snapshots(BlockDriverState *bs) } offset += name_size; sn->name[name_size] = '\0'; + + if (offset - s->snapshots_offset > QCOW_MAX_SNAPSHOTS_SIZE) { + ret = -EFBIG; + goto fail; + } } + assert(offset - s->snapshots_offset <= INT_MAX); s->snapshots_size = offset - s->snapshots_offset; return 0; @@ -163,7 +144,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs) uint32_t nb_snapshots; uint64_t snapshots_offset; } QEMU_PACKED header_data; - int64_t offset, snapshots_offset; + int64_t offset, snapshots_offset = 0; int ret; /* compute the size of the snapshots */ @@ -175,20 +156,36 @@ static int qcow2_write_snapshots(BlockDriverState *bs) offset += sizeof(extra); offset += strlen(sn->id_str); offset += strlen(sn->name); + + if (offset > QCOW_MAX_SNAPSHOTS_SIZE) { + ret = -EFBIG; + goto fail; + } } + + assert(offset <= INT_MAX); snapshots_size = offset; /* Allocate space for the new snapshot list */ snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size); offset = snapshots_offset; if (offset < 0) { - return offset; + ret = offset; + goto fail; } ret = bdrv_flush(bs); if (ret < 0) { - return ret; + goto fail; } + /* The snapshot list position has not yet been updated, so these clusters + * must indeed be completely free */ + ret = qcow2_pre_write_overlap_check(bs, 0, offset, snapshots_size); + if (ret < 0) { + goto fail; + } + + /* Write all snapshots to the new list */ for(i = 0; i < s->nb_snapshots; i++) { sn = s->snapshots + i; @@ -211,6 +208,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs) id_str_size = strlen(sn->id_str); name_size = strlen(sn->name); + assert(id_str_size <= UINT16_MAX && name_size <= UINT16_MAX); h.id_str_size = cpu_to_be16(id_str_size); h.name_size = cpu_to_be16(name_size); offset = align_offset(offset, 8); @@ -269,6 +267,10 @@ static int qcow2_write_snapshots(BlockDriverState *bs) return 0; fail: + if (snapshots_offset > 0) { + qcow2_free_clusters(bs, snapshots_offset, snapshots_size, + QCOW2_DISCARD_ALWAYS); + } return ret; } @@ -277,7 +279,8 @@ static void find_new_snapshot_id(BlockDriverState *bs, { BDRVQcowState *s = bs->opaque; QCowSnapshot *sn; - int i, id, id_max = 0; + int i; + unsigned long id, id_max = 0; for(i = 0; i < s->nb_snapshots; i++) { sn = s->snapshots + i; @@ -285,34 +288,50 @@ static void find_new_snapshot_id(BlockDriverState *bs, if (id > id_max) id_max = id; } - snprintf(id_str, id_str_size, "%d", id_max + 1); + snprintf(id_str, id_str_size, "%lu", id_max + 1); } -static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str) +static int find_snapshot_by_id_and_name(BlockDriverState *bs, + const char *id, + const char *name) { BDRVQcowState *s = bs->opaque; int i; - for(i = 0; i < s->nb_snapshots; i++) { - if (!strcmp(s->snapshots[i].id_str, id_str)) - return i; + if (id && name) { + for (i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].id_str, id) && + !strcmp(s->snapshots[i].name, name)) { + return i; + } + } + } else if (id) { + for (i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].id_str, id)) { + return i; + } + } + } else if (name) { + for (i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].name, name)) { + return i; + } + } } + return -1; } -static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name) +static int find_snapshot_by_id_or_name(BlockDriverState *bs, + const char *id_or_name) { - BDRVQcowState *s = bs->opaque; - int i, ret; + int ret; - ret = find_snapshot_by_id(bs, name); - if (ret >= 0) + ret = find_snapshot_by_id_and_name(bs, id_or_name, NULL); + if (ret >= 0) { return ret; - for(i = 0; i < s->nb_snapshots; i++) { - if (!strcmp(s->snapshots[i].name, name)) - return i; } - return -1; + return find_snapshot_by_id_and_name(bs, NULL, id_or_name); } /* if no id is provided, a new one is constructed */ @@ -326,6 +345,10 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) uint64_t *l1_table = NULL; int64_t l1_table_offset; + if (s->nb_snapshots >= QCOW_MAX_SNAPSHOTS) { + return -EFBIG; + } + memset(sn, 0, sizeof(*sn)); /* Generate an ID if it wasn't passed */ @@ -334,7 +357,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) } /* Check that the ID is unique */ - if (find_snapshot_by_id(bs, sn_info->id_str) >= 0) { + if (find_snapshot_by_id_and_name(bs, sn_info->id_str, NULL) >= 0) { return -EEXIST; } @@ -363,6 +386,12 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) l1_table[i] = cpu_to_be64(s->l1_table[i]); } + ret = qcow2_pre_write_overlap_check(bs, 0, sn->l1_table_offset, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table, s->l1_size * sizeof(uint64_t)); if (ret < 0) { @@ -396,11 +425,19 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) if (ret < 0) { g_free(s->snapshots); s->snapshots = old_snapshot_list; + s->nb_snapshots--; goto fail; } g_free(old_snapshot_list); + /* The VM state isn't needed any more in the active L1 table; in fact, it + * hurts by causing expensive COW for the next snapshot. */ + qcow2_discard_clusters(bs, qcow2_vm_state_offset(s), + align_offset(sn->vm_state_size, s->cluster_size) + >> BDRV_SECTOR_BITS, + QCOW2_DISCARD_NEVER); + #ifdef DEBUG_ALLOC { BdrvCheckResult result = {0}; @@ -475,6 +512,12 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) goto fail; } + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1, + s->l1_table_offset, cur_l1_bytes); + if (ret < 0) { + goto fail; + } + ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table, cur_l1_bytes); if (ret < 0) { @@ -531,15 +574,19 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) return ret; } -int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +int qcow2_snapshot_delete(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp) { BDRVQcowState *s = bs->opaque; QCowSnapshot sn; int snapshot_index, ret; /* Search the snapshot */ - snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); + snapshot_index = find_snapshot_by_id_and_name(bs, snapshot_id, name); if (snapshot_index < 0) { + error_setg(errp, "Can't find the snapshot"); return -ENOENT; } sn = s->snapshots[snapshot_index]; @@ -551,6 +598,8 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) s->nb_snapshots--; ret = qcow2_write_snapshots(bs); if (ret < 0) { + error_setg_errno(errp, -ret, + "Failed to remove snapshot from snapshot list"); return ret; } @@ -568,6 +617,7 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset, sn.l1_size, -1); if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to free the cluster and L1 table"); return ret; } qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t), @@ -576,6 +626,8 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) /* must update the copied flag on the current cluster offsets */ ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); if (ret < 0) { + error_setg_errno(errp, -ret, + "Failed to update snapshot status in disk"); return ret; } @@ -617,7 +669,10 @@ int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) return s->nb_snapshots; } -int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name) +int qcow2_snapshot_load_tmp(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp) { int i, snapshot_index; BDRVQcowState *s = bs->opaque; @@ -629,18 +684,25 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name) assert(bs->read_only); /* Search the snapshot */ - snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_name); + snapshot_index = find_snapshot_by_id_and_name(bs, snapshot_id, name); if (snapshot_index < 0) { + error_setg(errp, + "Can't find snapshot"); return -ENOENT; } sn = &s->snapshots[snapshot_index]; /* Allocate and read in the snapshot's L1 table */ - new_l1_bytes = s->l1_size * sizeof(uint64_t); + if (sn->l1_size > QCOW_MAX_L1_SIZE) { + error_setg(errp, "Snapshot L1 table too large"); + return -EFBIG; + } + new_l1_bytes = sn->l1_size * sizeof(uint64_t); new_l1_table = g_malloc0(align_offset(new_l1_bytes, 512)); ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes); if (ret < 0) { + error_setg(errp, "Failed to read l1 table for snapshot"); g_free(new_l1_table); return ret; } diff --git a/block/qcow2.c b/block/qcow2.c index 3376901bd7..a4b97e8263 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -52,7 +52,7 @@ typedef struct { uint32_t magic; uint32_t len; -} QCowExtension; +} QEMU_PACKED QCowExtension; #define QCOW2_EXT_MAGIC_END 0 #define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA @@ -79,7 +79,8 @@ static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) * return 0 upon success, non-0 otherwise */ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, - uint64_t end_offset, void **p_feature_table) + uint64_t end_offset, void **p_feature_table, + Error **errp) { BDRVQcowState *s = bs->opaque; QCowExtension ext; @@ -100,10 +101,10 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, printf("attempting to read extended header in offset %lu\n", offset); #endif - if (bdrv_pread(bs->file, offset, &ext, sizeof(ext)) != sizeof(ext)) { - fprintf(stderr, "qcow2_read_extension: ERROR: " - "pread fail from offset %" PRIu64 "\n", - offset); + ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext)); + if (ret < 0) { + error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: " + "pread fail from offset %" PRIu64, offset); return 1; } be32_to_cpus(&ext.magic); @@ -113,7 +114,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, printf("ext.magic = 0x%x\n", ext.magic); #endif if (ext.len > end_offset - offset) { - error_report("Header extension too large"); + error_setg(errp, "Header extension too large"); return -EINVAL; } @@ -123,14 +124,17 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, case QCOW2_EXT_MAGIC_BACKING_FORMAT: if (ext.len >= sizeof(bs->backing_format)) { - fprintf(stderr, "ERROR: ext_backing_format: len=%u too large" - " (>=%zu)\n", - ext.len, sizeof(bs->backing_format)); + error_setg(errp, "ERROR: ext_backing_format: len=%" PRIu32 + " too large (>=%zu)", ext.len, + sizeof(bs->backing_format)); return 2; } - if (bdrv_pread(bs->file, offset , bs->backing_format, - ext.len) != ext.len) + ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len); + if (ret < 0) { + error_setg_errno(errp, -ret, "ERROR: ext_backing_format: " + "Could not read format name"); return 3; + } bs->backing_format[ext.len] = '\0'; #ifdef DEBUG_EXT printf("Qcow2: Got format extension %s\n", bs->backing_format); @@ -142,6 +146,8 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature)); ret = bdrv_pread(bs->file, offset , feature_table, ext.len); if (ret < 0) { + error_setg_errno(errp, -ret, "ERROR: ext_feature_table: " + "Could not read table"); return ret; } @@ -161,6 +167,8 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, ret = bdrv_pread(bs->file, offset , uext->data, uext->len); if (ret < 0) { + error_setg_errno(errp, -ret, "ERROR: unknown extension: " + "Could not read data"); return ret; } } @@ -184,8 +192,8 @@ static void cleanup_unknown_header_ext(BlockDriverState *bs) } } -static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs, - const char *fmt, ...) +static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs, + Error **errp, const char *fmt, ...) { char msg[64]; va_list ap; @@ -194,17 +202,17 @@ static void GCC_FMT_ATTR(2, 3) report_unsupported(BlockDriverState *bs, vsnprintf(msg, sizeof(msg), fmt, ap); va_end(ap); - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bs->device_name, "qcow2", msg); + error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, bs->device_name, "qcow2", + msg); } static void report_unsupported_feature(BlockDriverState *bs, - Qcow2Feature *table, uint64_t mask) + Error **errp, Qcow2Feature *table, uint64_t mask) { while (table && table->name[0] != '\0') { if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) { if (mask & (1 << table->bit)) { - report_unsupported(bs, "%.46s",table->name); + report_unsupported(bs, errp, "%.46s", table->name); mask &= ~(1 << table->bit); } } @@ -212,7 +220,8 @@ static void report_unsupported_feature(BlockDriverState *bs, } if (mask) { - report_unsupported(bs, "Unknown incompatible feature: %" PRIx64, mask); + report_unsupported(bs, errp, "Unknown incompatible feature: %" PRIx64, + mask); } } @@ -261,12 +270,46 @@ static int qcow2_mark_clean(BlockDriverState *bs) BDRVQcowState *s = bs->opaque; if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { + int ret; + + s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY; + + ret = bdrv_flush(bs); + if (ret < 0) { + return ret; + } + + return qcow2_update_header(bs); + } + return 0; +} + +/* + * Marks the image as corrupt. + */ +int qcow2_mark_corrupt(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT; + return qcow2_update_header(bs); +} + +/* + * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes + * before if necessary. + */ +int qcow2_mark_consistent(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { int ret = bdrv_flush(bs); if (ret < 0) { return ret; } - s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY; + s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT; return qcow2_update_header(bs); } return 0; @@ -281,11 +324,41 @@ static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, } if (fix && result->check_errors == 0 && result->corruptions == 0) { - return qcow2_mark_clean(bs); + ret = qcow2_mark_clean(bs); + if (ret < 0) { + return ret; + } + return qcow2_mark_consistent(bs); } return ret; } +static int validate_table_offset(BlockDriverState *bs, uint64_t offset, + uint64_t entries, size_t entry_len) +{ + BDRVQcowState *s = bs->opaque; + uint64_t size; + + /* Use signed INT64_MAX as the maximum even for uint64_t header fields, + * because values will be passed to qemu functions taking int64_t. */ + if (entries > INT64_MAX / entry_len) { + return -EINVAL; + } + + size = entries * entry_len; + + if (INT64_MAX - size < offset) { + return -EINVAL; + } + + /* Tables must be cluster aligned */ + if (offset & (s->cluster_size - 1)) { + return -EINVAL; + } + + return 0; +} + static QemuOptsList qcow2_runtime_opts = { .name = "qcow2", .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head), @@ -311,22 +384,84 @@ static QemuOptsList qcow2_runtime_opts = { .type = QEMU_OPT_BOOL, .help = "Generate discard requests when other clusters are freed", }, + { + .name = QCOW2_OPT_OVERLAP, + .type = QEMU_OPT_STRING, + .help = "Selects which overlap checks to perform from a range of " + "templates (none, constant, cached, all)", + }, + { + .name = QCOW2_OPT_OVERLAP_MAIN_HEADER, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into the main qcow2 header", + }, + { + .name = QCOW2_OPT_OVERLAP_ACTIVE_L1, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into the active L1 table", + }, + { + .name = QCOW2_OPT_OVERLAP_ACTIVE_L2, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into an active L2 table", + }, + { + .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into the refcount table", + }, + { + .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into a refcount block", + }, + { + .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into the snapshot table", + }, + { + .name = QCOW2_OPT_OVERLAP_INACTIVE_L1, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into an inactive L1 table", + }, + { + .name = QCOW2_OPT_OVERLAP_INACTIVE_L2, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into an inactive L2 table", + }, { /* end of list */ } }, }; -static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) +static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = { + [QCOW2_OL_MAIN_HEADER_BITNR] = QCOW2_OPT_OVERLAP_MAIN_HEADER, + [QCOW2_OL_ACTIVE_L1_BITNR] = QCOW2_OPT_OVERLAP_ACTIVE_L1, + [QCOW2_OL_ACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_ACTIVE_L2, + [QCOW2_OL_REFCOUNT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE, + [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK, + [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE, + [QCOW2_OL_INACTIVE_L1_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L1, + [QCOW2_OL_INACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L2, +}; + +static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVQcowState *s = bs->opaque; - int len, i, ret = 0; + unsigned int len, i; + int ret = 0; QCowHeader header; QemuOpts *opts; Error *local_err = NULL; uint64_t ext_end; uint64_t l1_vm_state_index; + const char *opt_overlap_check; + int overlap_check_template = 0; ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read qcow2 header"); goto fail; } be32_to_cpus(&header.magic); @@ -344,17 +479,31 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) be32_to_cpus(&header.nb_snapshots); if (header.magic != QCOW_MAGIC) { - ret = -EMEDIUMTYPE; + error_setg(errp, "Image is not in qcow2 format"); + ret = -EINVAL; goto fail; } if (header.version < 2 || header.version > 3) { - report_unsupported(bs, "QCOW version %d", header.version); + report_unsupported(bs, errp, "QCOW version %" PRIu32, header.version); ret = -ENOTSUP; goto fail; } s->qcow_version = header.version; + /* Initialise cluster size */ + if (header.cluster_bits < MIN_CLUSTER_BITS || + header.cluster_bits > MAX_CLUSTER_BITS) { + error_setg(errp, "Unsupported cluster size: 2^%" PRIu32, + header.cluster_bits); + ret = -EINVAL; + goto fail; + } + + s->cluster_bits = header.cluster_bits; + s->cluster_size = 1 << s->cluster_bits; + s->cluster_sectors = 1 << (s->cluster_bits - 9); + /* Initialise version 3 header fields */ if (header.version == 2) { header.incompatible_features = 0; @@ -368,6 +517,18 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) be64_to_cpus(&header.autoclear_features); be32_to_cpus(&header.refcount_order); be32_to_cpus(&header.header_length); + + if (header.header_length < 104) { + error_setg(errp, "qcow2 header too short"); + ret = -EINVAL; + goto fail; + } + } + + if (header.header_length > s->cluster_size) { + error_setg(errp, "qcow2 header exceeds cluster size"); + ret = -EINVAL; + goto fail; } if (header.header_length > sizeof(header)) { @@ -376,10 +537,18 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields, s->unknown_header_fields_size); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read unknown qcow2 header " + "fields"); goto fail; } } + if (header.backing_file_offset > s->cluster_size) { + error_setg(errp, "Invalid backing file offset"); + ret = -EINVAL; + goto fail; + } + if (header.backing_file_offset) { ext_end = header.backing_file_offset; } else { @@ -394,28 +563,38 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) { void *feature_table = NULL; qcow2_read_extensions(bs, header.header_length, ext_end, - &feature_table); - report_unsupported_feature(bs, feature_table, + &feature_table, NULL); + report_unsupported_feature(bs, errp, feature_table, s->incompatible_features & ~QCOW2_INCOMPAT_MASK); ret = -ENOTSUP; + g_free(feature_table); goto fail; } + if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { + /* Corrupt images may not be written to unless they are being repaired + */ + if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) { + error_setg(errp, "qcow2: Image is corrupt; cannot be opened " + "read/write"); + ret = -EACCES; + goto fail; + } + } + /* Check support for various header values */ if (header.refcount_order != 4) { - report_unsupported(bs, "%d bit reference counts", + report_unsupported(bs, errp, "%d bit reference counts", 1 << header.refcount_order); ret = -ENOTSUP; goto fail; } + s->refcount_order = header.refcount_order; - if (header.cluster_bits < MIN_CLUSTER_BITS || - header.cluster_bits > MAX_CLUSTER_BITS) { - ret = -EINVAL; - goto fail; - } if (header.crypt_method > QCOW_CRYPT_AES) { + error_setg(errp, "Unsupported encryption method: %" PRIu32, + header.crypt_method); ret = -EINVAL; goto fail; } @@ -423,27 +602,57 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) if (s->crypt_method_header) { bs->encrypted = 1; } - s->cluster_bits = header.cluster_bits; - s->cluster_size = 1 << s->cluster_bits; - s->cluster_sectors = 1 << (s->cluster_bits - 9); + s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */ s->l2_size = 1 << s->l2_bits; bs->total_sectors = header.size / 512; s->csize_shift = (62 - (s->cluster_bits - 8)); s->csize_mask = (1 << (s->cluster_bits - 8)) - 1; s->cluster_offset_mask = (1LL << s->csize_shift) - 1; + s->refcount_table_offset = header.refcount_table_offset; s->refcount_table_size = header.refcount_table_clusters << (s->cluster_bits - 3); - s->snapshots_offset = header.snapshots_offset; - s->nb_snapshots = header.nb_snapshots; + if (header.refcount_table_clusters > qcow2_max_refcount_clusters(s)) { + error_setg(errp, "Reference count table too large"); + ret = -EINVAL; + goto fail; + } + + ret = validate_table_offset(bs, s->refcount_table_offset, + s->refcount_table_size, sizeof(uint64_t)); + if (ret < 0) { + error_setg(errp, "Invalid reference count table offset"); + goto fail; + } + + /* Snapshot table offset/length */ + if (header.nb_snapshots > QCOW_MAX_SNAPSHOTS) { + error_setg(errp, "Too many snapshots"); + ret = -EINVAL; + goto fail; + } + + ret = validate_table_offset(bs, header.snapshots_offset, + header.nb_snapshots, + sizeof(QCowSnapshotHeader)); + if (ret < 0) { + error_setg(errp, "Invalid snapshot table offset"); + goto fail; + } /* read the level 1 table */ + if (header.l1_size > QCOW_MAX_L1_SIZE) { + error_setg(errp, "Active L1 table too large"); + ret = -EFBIG; + goto fail; + } s->l1_size = header.l1_size; l1_vm_state_index = size_to_l1(s, header.size); if (l1_vm_state_index > INT_MAX) { + error_setg(errp, "Image is too big"); ret = -EFBIG; goto fail; } @@ -452,16 +661,27 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) /* the L1 table must contain at least enough entries to put header.size bytes */ if (s->l1_size < s->l1_vm_state_index) { + error_setg(errp, "L1 table is too small"); ret = -EINVAL; goto fail; } + + ret = validate_table_offset(bs, header.l1_table_offset, + header.l1_size, sizeof(uint64_t)); + if (ret < 0) { + error_setg(errp, "Invalid L1 table offset"); + goto fail; + } s->l1_table_offset = header.l1_table_offset; + + if (s->l1_size > 0) { s->l1_table = g_malloc0( align_offset(s->l1_size * sizeof(uint64_t), 512)); ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read L1 table"); goto fail; } for(i = 0;i < s->l1_size; i++) { @@ -482,6 +702,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) ret = qcow2_refcount_init(bs); if (ret != 0) { + error_setg_errno(errp, -ret, "Could not initialize refcount handling"); goto fail; } @@ -489,7 +710,9 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) QTAILQ_INIT(&s->discards); /* read qcow2 extensions */ - if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL)) { + if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL, + &local_err)) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } @@ -497,27 +720,36 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) /* read the backing file name */ if (header.backing_file_offset != 0) { len = header.backing_file_size; - if (len > 1023) { - len = 1023; + if (len > MIN(1023, s->cluster_size - header.backing_file_offset)) { + error_setg(errp, "Backing file name too long"); + ret = -EINVAL; + goto fail; } ret = bdrv_pread(bs->file, header.backing_file_offset, bs->backing_file, len); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read backing file name"); goto fail; } bs->backing_file[len] = '\0'; } + /* Internal snapshots */ + s->snapshots_offset = header.snapshots_offset; + s->nb_snapshots = header.nb_snapshots; + ret = qcow2_read_snapshots(bs); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read snapshots"); goto fail; } /* Clear unknown autoclear feature bits */ - if (!bs->read_only && s->autoclear_features != 0) { + if (!bs->read_only && !(flags & BDRV_O_INCOMING) && s->autoclear_features) { s->autoclear_features = 0; ret = qcow2_update_header(bs); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not update qcow2 header"); goto fail; } } @@ -526,22 +758,22 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) qemu_co_mutex_init(&s->lock); /* Repair image if dirty */ - if (!(flags & BDRV_O_CHECK) && !bs->read_only && + if (!(flags & (BDRV_O_CHECK | BDRV_O_INCOMING)) && !bs->read_only && (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) { BdrvCheckResult result = {0}; ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not repair dirty image"); goto fail; } } /* Enable lazy_refcounts according to image and command line options */ - opts = qemu_opts_create_nofail(&qcow2_runtime_opts); + opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } @@ -559,11 +791,38 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) s->discard_passthrough[QCOW2_DISCARD_OTHER] = qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false); + opt_overlap_check = qemu_opt_get(opts, "overlap-check") ?: "cached"; + if (!strcmp(opt_overlap_check, "none")) { + overlap_check_template = 0; + } else if (!strcmp(opt_overlap_check, "constant")) { + overlap_check_template = QCOW2_OL_CONSTANT; + } else if (!strcmp(opt_overlap_check, "cached")) { + overlap_check_template = QCOW2_OL_CACHED; + } else if (!strcmp(opt_overlap_check, "all")) { + overlap_check_template = QCOW2_OL_ALL; + } else { + error_setg(errp, "Unsupported value '%s' for qcow2 option " + "'overlap-check'. Allowed are either of the following: " + "none, constant, cached, all", opt_overlap_check); + qemu_opts_del(opts); + ret = -EINVAL; + goto fail; + } + + s->overlap_check = 0; + for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) { + /* overlap-check defines a template bitmask, but every flag may be + * overwritten through the associated boolean option */ + s->overlap_check |= + qemu_opt_get_bool(opts, overlap_bool_option_names[i], + overlap_check_template & (1 << i)) << i; + } + qemu_opts_del(opts); if (s->use_lazy_refcounts && s->qcow_version < 3) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, "Lazy refcounts require " - "a qcow2 image with at least qemu 1.1 compatibility level"); + error_setg(errp, "Lazy refcounts require a qcow2 image with at least " + "qemu 1.1 compatibility level"); ret = -EINVAL; goto fail; } @@ -582,14 +841,28 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags) qcow2_free_snapshots(bs); qcow2_refcount_close(bs); g_free(s->l1_table); + /* else pre-write overlap checks in cache_destroy may crash */ + s->l1_table = NULL; if (s->l2_table_cache) { qcow2_cache_destroy(bs, s->l2_table_cache); } + if (s->refcount_block_cache) { + qcow2_cache_destroy(bs, s->refcount_block_cache); + } g_free(s->cluster_cache); qemu_vfree(s->cluster_data); return ret; } +static int qcow2_refresh_limits(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + bs->bl.write_zeroes_alignment = s->cluster_sectors; + + return 0; +} + static int qcow2_set_key(BlockDriverState *bs, const char *key) { BDRVQcowState *s = bs->opaque; @@ -632,32 +905,56 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key) return 0; } -/* We have nothing to do for QCOW2 reopen, stubs just return - * success */ +/* We have no actual commit/abort logic for qcow2, but we need to write out any + * unwritten data if we reopen read-only. */ static int qcow2_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue, Error **errp) { + int ret; + + if ((state->flags & BDRV_O_RDWR) == 0) { + ret = bdrv_flush(state->bs); + if (ret < 0) { + return ret; + } + + ret = qcow2_mark_clean(state->bs); + if (ret < 0) { + return ret; + } + } + return 0; } -static int coroutine_fn qcow2_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { BDRVQcowState *s = bs->opaque; uint64_t cluster_offset; - int ret; + int index_in_cluster, ret; + int64_t status = 0; *pnum = nb_sectors; - /* FIXME We can get errors here, but the bdrv_co_is_allocated interface - * can't pass them on today */ qemu_co_mutex_lock(&s->lock); ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset); qemu_co_mutex_unlock(&s->lock); if (ret < 0) { - *pnum = 0; + return ret; } - return (cluster_offset != 0) || (ret == QCOW2_CLUSTER_ZERO); + if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED && + !s->crypt_method) { + index_in_cluster = sector_num & (s->cluster_sectors - 1); + cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); + status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset; + } + if (ret == QCOW2_CLUSTER_ZERO) { + status |= BDRV_BLOCK_ZERO; + } else if (ret != QCOW2_CLUSTER_UNALLOCATED) { + status |= BDRV_BLOCK_DATA; + } + return status; } /* handle reading after the end of the backing file */ @@ -821,7 +1118,6 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, { BDRVQcowState *s = bs->opaque; int index_in_cluster; - int n_end; int ret; int cur_nr_sectors; /* number of sectors in current iteration */ uint64_t cluster_offset; @@ -845,14 +1141,16 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, trace_qcow2_writev_start_part(qemu_coroutine_self()); index_in_cluster = sector_num & (s->cluster_sectors - 1); - n_end = index_in_cluster + remaining_sectors; + cur_nr_sectors = remaining_sectors; if (s->crypt_method && - n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) { - n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; + cur_nr_sectors > + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster) { + cur_nr_sectors = + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster; } ret = qcow2_alloc_cluster_offset(bs, sector_num << 9, - index_in_cluster, n_end, &cur_nr_sectors, &cluster_offset, &l2meta); + &cur_nr_sectors, &cluster_offset, &l2meta); if (ret < 0) { goto fail; } @@ -881,6 +1179,13 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, cur_nr_sectors * 512); } + ret = qcow2_pre_write_overlap_check(bs, 0, + cluster_offset + index_in_cluster * BDRV_SECTOR_SIZE, + cur_nr_sectors * BDRV_SECTOR_SIZE); + if (ret < 0) { + goto fail; + } + qemu_co_mutex_unlock(&s->lock); BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); trace_qcow2_writev_data(qemu_coroutine_self(), @@ -947,11 +1252,15 @@ static void qcow2_close(BlockDriverState *bs) { BDRVQcowState *s = bs->opaque; g_free(s->l1_table); + /* else pre-write overlap checks in cache_destroy may crash */ + s->l1_table = NULL; - qcow2_cache_flush(bs, s->l2_table_cache); - qcow2_cache_flush(bs, s->refcount_block_cache); + if (!(bs->open_flags & BDRV_O_INCOMING)) { + qcow2_cache_flush(bs, s->l2_table_cache); + qcow2_cache_flush(bs, s->refcount_block_cache); - qcow2_mark_clean(bs); + qcow2_mark_clean(bs); + } qcow2_cache_destroy(bs, s->l2_table_cache); qcow2_cache_destroy(bs, s->refcount_block_cache); @@ -965,7 +1274,7 @@ static void qcow2_close(BlockDriverState *bs) qcow2_free_snapshots(bs); } -static void qcow2_invalidate_cache(BlockDriverState *bs) +static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) { BDRVQcowState *s = bs->opaque; int flags = s->flags; @@ -973,6 +1282,8 @@ static void qcow2_invalidate_cache(BlockDriverState *bs) AES_KEY aes_decrypt_key; uint32_t crypt_method = 0; QDict *options; + Error *local_err = NULL; + int ret; /* * Backing files are read-only which makes all of their metadata immutable, @@ -987,12 +1298,25 @@ static void qcow2_invalidate_cache(BlockDriverState *bs) qcow2_close(bs); - options = qdict_new(); - qdict_put(options, QCOW2_OPT_LAZY_REFCOUNTS, - qbool_from_int(s->use_lazy_refcounts)); + bdrv_invalidate_cache(bs->file, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } memset(s, 0, sizeof(BDRVQcowState)); - qcow2_open(bs, options, flags); + options = qdict_clone_shallow(bs->options); + + ret = qcow2_open(bs, options, flags, &local_err); + if (local_err) { + error_setg(errp, "Could not reopen qcow2 layer: %s", + error_get_pretty(local_err)); + error_free(local_err); + return; + } else if (ret < 0) { + error_setg_errno(errp, -ret, "Could not reopen qcow2 layer"); + return; + } QDECREF(options); @@ -1076,7 +1400,7 @@ int qcow2_update_header(BlockDriverState *bs) .incompatible_features = cpu_to_be64(s->incompatible_features), .compatible_features = cpu_to_be64(s->compatible_features), .autoclear_features = cpu_to_be64(s->autoclear_features), - .refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT), + .refcount_order = cpu_to_be32(s->refcount_order), .header_length = cpu_to_be32(header_length), }; @@ -1129,6 +1453,11 @@ int qcow2_update_header(BlockDriverState *bs) .bit = QCOW2_INCOMPAT_DIRTY_BITNR, .name = "dirty bit", }, + { + .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, + .bit = QCOW2_INCOMPAT_CORRUPT_BITNR, + .name = "corrupt bit", + }, { .type = QCOW2_FEAT_TYPE_COMPATIBLE, .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, @@ -1210,34 +1539,39 @@ static int preallocate(BlockDriverState *bs) int ret; QCowL2Meta *meta; - nb_sectors = bdrv_getlength(bs) >> 9; + nb_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; offset = 0; while (nb_sectors) { - num = MIN(nb_sectors, INT_MAX >> 9); - ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, + num = MIN(nb_sectors, INT_MAX >> BDRV_SECTOR_BITS); + ret = qcow2_alloc_cluster_offset(bs, offset, &num, &host_offset, &meta); if (ret < 0) { return ret; } - ret = qcow2_alloc_cluster_link_l2(bs, meta); - if (ret < 0) { - qcow2_free_any_clusters(bs, meta->alloc_offset, meta->nb_clusters, - QCOW2_DISCARD_NEVER); - return ret; - } + while (meta) { + QCowL2Meta *next = meta->next; + + ret = qcow2_alloc_cluster_link_l2(bs, meta); + if (ret < 0) { + qcow2_free_any_clusters(bs, meta->alloc_offset, + meta->nb_clusters, QCOW2_DISCARD_NEVER); + return ret; + } - /* There are no dependent requests, but we need to remove our request - * from the list of in-flight requests */ - if (meta != NULL) { + /* There are no dependent requests, but we need to remove our + * request from the list of in-flight requests */ QLIST_REMOVE(meta, next_in_flight); + + g_free(meta); + meta = next; } /* TODO Preallocate data if requested */ nb_sectors -= num; - offset += num << 9; + offset += num << BDRV_SECTOR_BITS; } /* @@ -1246,9 +1580,10 @@ static int preallocate(BlockDriverState *bs) * EOF). Extend the image to the last allocated sector. */ if (host_offset != 0) { - uint8_t buf[512]; - memset(buf, 0, 512); - ret = bdrv_write(bs->file, (host_offset >> 9) + num - 1, buf, 1); + uint8_t buf[BDRV_SECTOR_SIZE]; + memset(buf, 0, BDRV_SECTOR_SIZE); + ret = bdrv_write(bs->file, (host_offset >> BDRV_SECTOR_BITS) + num - 1, + buf, 1); if (ret < 0) { return ret; } @@ -1260,7 +1595,8 @@ static int preallocate(BlockDriverState *bs) static int qcow2_create2(const char *filename, int64_t total_size, const char *backing_file, const char *backing_format, int flags, size_t cluster_size, int prealloc, - QEMUOptionParameter *options, int version) + QEMUOptionParameter *options, int version, + Error **errp) { /* Calculate cluster_bits */ int cluster_bits; @@ -1268,9 +1604,8 @@ static int qcow2_create2(const char *filename, int64_t total_size, if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || (1 << cluster_bits) != cluster_size) { - error_report( - "Cluster size must be a power of two between %d and %dk", - 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10)); + error_setg(errp, "Cluster size must be a power of two between %d and " + "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10)); return -EINVAL; } @@ -1287,59 +1622,72 @@ static int qcow2_create2(const char *filename, int64_t total_size, * size for any qcow2 image. */ BlockDriverState* bs; - QCowHeader header; - uint8_t* refcount_table; + QCowHeader *header; + uint64_t* refcount_table; + Error *local_err = NULL; int ret; - ret = bdrv_create_file(filename, options); + ret = bdrv_create_file(filename, options, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } - ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR); + bs = NULL; + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + NULL, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } /* Write the header */ - memset(&header, 0, sizeof(header)); - header.magic = cpu_to_be32(QCOW_MAGIC); - header.version = cpu_to_be32(version); - header.cluster_bits = cpu_to_be32(cluster_bits); - header.size = cpu_to_be64(0); - header.l1_table_offset = cpu_to_be64(0); - header.l1_size = cpu_to_be32(0); - header.refcount_table_offset = cpu_to_be64(cluster_size); - header.refcount_table_clusters = cpu_to_be32(1); - header.refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT); - header.header_length = cpu_to_be32(sizeof(header)); + QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header)); + header = g_malloc0(cluster_size); + *header = (QCowHeader) { + .magic = cpu_to_be32(QCOW_MAGIC), + .version = cpu_to_be32(version), + .cluster_bits = cpu_to_be32(cluster_bits), + .size = cpu_to_be64(0), + .l1_table_offset = cpu_to_be64(0), + .l1_size = cpu_to_be32(0), + .refcount_table_offset = cpu_to_be64(cluster_size), + .refcount_table_clusters = cpu_to_be32(1), + .refcount_order = cpu_to_be32(3 + REFCOUNT_SHIFT), + .header_length = cpu_to_be32(sizeof(*header)), + }; if (flags & BLOCK_FLAG_ENCRYPT) { - header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); + header->crypt_method = cpu_to_be32(QCOW_CRYPT_AES); } else { - header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); + header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); } if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) { - header.compatible_features |= + header->compatible_features |= cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); } - ret = bdrv_pwrite(bs, 0, &header, sizeof(header)); + ret = bdrv_pwrite(bs, 0, header, cluster_size); + g_free(header); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write qcow2 header"); goto out; } - /* Write an empty refcount table */ - refcount_table = g_malloc0(cluster_size); - ret = bdrv_pwrite(bs, cluster_size, refcount_table, cluster_size); + /* Write a refcount table with one refcount block */ + refcount_table = g_malloc0(2 * cluster_size); + refcount_table[0] = cpu_to_be64(2 * cluster_size); + ret = bdrv_pwrite(bs, cluster_size, refcount_table, 2 * cluster_size); g_free(refcount_table); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write refcount table"); goto out; } - bdrv_close(bs); + bdrv_unref(bs); + bs = NULL; /* * And now open the image and make it consistent first (i.e. increase the @@ -1348,14 +1696,17 @@ static int qcow2_create2(const char *filename, int64_t total_size, */ BlockDriver* drv = bdrv_find_format("qcow2"); assert(drv != NULL); - ret = bdrv_open(bs, filename, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv); + ret = bdrv_open(&bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, drv, &local_err); if (ret < 0) { + error_propagate(errp, local_err); goto out; } - ret = qcow2_alloc_clusters(bs, 2 * cluster_size); + ret = qcow2_alloc_clusters(bs, 3 * cluster_size); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 " + "header and refcount table"); goto out; } else if (ret != 0) { @@ -1366,6 +1717,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, /* Okay, now that we have a valid image, let's give it the right size */ ret = bdrv_truncate(bs, total_size * BDRV_SECTOR_SIZE); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not resize image"); goto out; } @@ -1373,6 +1725,8 @@ static int qcow2_create2(const char *filename, int64_t total_size, if (backing_file) { ret = bdrv_change_backing_file(bs, backing_file, backing_format); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not assign backing file '%s' " + "with format '%s'", backing_file, backing_format); goto out; } } @@ -1384,17 +1738,33 @@ static int qcow2_create2(const char *filename, int64_t total_size, ret = preallocate(bs); qemu_co_mutex_unlock(&s->lock); if (ret < 0) { + error_setg_errno(errp, -ret, "Could not preallocate metadata"); goto out; } } + bdrv_unref(bs); + bs = NULL; + + /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning */ + ret = bdrv_open(&bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_BACKING, + drv, &local_err); + if (local_err) { + error_propagate(errp, local_err); + goto out; + } + ret = 0; out: - bdrv_delete(bs); + if (bs) { + bdrv_unref(bs); + } return ret; } -static int qcow2_create(const char *filename, QEMUOptionParameter *options) +static int qcow2_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { const char *backing_file = NULL; const char *backing_fmt = NULL; @@ -1402,7 +1772,9 @@ static int qcow2_create(const char *filename, QEMUOptionParameter *options) int flags = 0; size_t cluster_size = DEFAULT_CLUSTER_SIZE; int prealloc = 0; - int version = 2; + int version = 3; + Error *local_err = NULL; + int ret; /* Read out options */ while (options && options->name) { @@ -1424,18 +1796,20 @@ static int qcow2_create(const char *filename, QEMUOptionParameter *options) } else if (!strcmp(options->value.s, "metadata")) { prealloc = 1; } else { - fprintf(stderr, "Invalid preallocation mode: '%s'\n", - options->value.s); + error_setg(errp, "Invalid preallocation mode: '%s'", + options->value.s); return -EINVAL; } } else if (!strcmp(options->name, BLOCK_OPT_COMPAT_LEVEL)) { - if (!options->value.s || !strcmp(options->value.s, "0.10")) { + if (!options->value.s) { + /* keep the default */ + } else if (!strcmp(options->value.s, "0.10")) { version = 2; } else if (!strcmp(options->value.s, "1.1")) { version = 3; } else { - fprintf(stderr, "Invalid compatibility level: '%s'\n", - options->value.s); + error_setg(errp, "Invalid compatibility level: '%s'", + options->value.s); return -EINVAL; } } else if (!strcmp(options->name, BLOCK_OPT_LAZY_REFCOUNTS)) { @@ -1445,43 +1819,27 @@ static int qcow2_create(const char *filename, QEMUOptionParameter *options) } if (backing_file && prealloc) { - fprintf(stderr, "Backing file and preallocation cannot be used at " - "the same time\n"); + error_setg(errp, "Backing file and preallocation cannot be used at " + "the same time"); return -EINVAL; } if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) { - fprintf(stderr, "Lazy refcounts only supported with compatibility " - "level 1.1 and above (use compat=1.1 or greater)\n"); + error_setg(errp, "Lazy refcounts only supported with compatibility " + "level 1.1 and above (use compat=1.1 or greater)"); return -EINVAL; } - return qcow2_create2(filename, sectors, backing_file, backing_fmt, flags, - cluster_size, prealloc, options, version); -} - -static int qcow2_make_empty(BlockDriverState *bs) -{ -#if 0 - /* XXX: not correct */ - BDRVQcowState *s = bs->opaque; - uint32_t l1_length = s->l1_size * sizeof(uint64_t); - int ret; - - memset(s->l1_table, 0, l1_length); - if (bdrv_pwrite(bs->file, s->l1_table_offset, s->l1_table, l1_length) < 0) - return -1; - ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); - if (ret < 0) - return ret; - - l2_cache_reset(bs); -#endif - return 0; + ret = qcow2_create2(filename, sectors, backing_file, backing_fmt, flags, + cluster_size, prealloc, options, version, &local_err); + if (local_err) { + error_propagate(errp, local_err); + } + return ret; } static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) { int ret; BDRVQcowState *s = bs->opaque; @@ -1508,7 +1866,7 @@ static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, qemu_co_mutex_lock(&s->lock); ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors); + nb_sectors, QCOW2_DISCARD_REQUEST); qemu_co_mutex_unlock(&s->lock); return ret; } @@ -1631,6 +1989,12 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, goto fail; } cluster_offset &= s->cluster_offset_mask; + + ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len); + if (ret < 0) { + goto fail; + } + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); if (ret < 0) { @@ -1668,19 +2032,43 @@ static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) return 0; } -static int64_t qcow2_vm_state_offset(BDRVQcowState *s) -{ - return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); -} - static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { BDRVQcowState *s = bs->opaque; + bdi->unallocated_blocks_are_zero = true; + bdi->can_write_zeroes_with_unmap = (s->qcow_version >= 3); bdi->cluster_size = s->cluster_size; bdi->vm_state_offset = qcow2_vm_state_offset(s); return 0; } +static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1); + + *spec_info = (ImageInfoSpecific){ + .kind = IMAGE_INFO_SPECIFIC_KIND_QCOW2, + { + .qcow2 = g_new(ImageInfoSpecificQCow2, 1), + }, + }; + if (s->qcow_version == 2) { + *spec_info->qcow2 = (ImageInfoSpecificQCow2){ + .compat = g_strdup("0.10"), + }; + } else if (s->qcow_version == 3) { + *spec_info->qcow2 = (ImageInfoSpecificQCow2){ + .compat = g_strdup("1.1"), + .lazy_refcounts = s->compatible_features & + QCOW2_COMPAT_LAZY_REFCOUNTS, + .has_lazy_refcounts = true, + }; + } + + return spec_info; +} + #if 0 static void dump_refcounts(BlockDriverState *bs) { @@ -1706,13 +2094,22 @@ static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) { BDRVQcowState *s = bs->opaque; + int64_t total_sectors = bs->total_sectors; int growable = bs->growable; + bool zero_beyond_eof = bs->zero_beyond_eof; int ret; BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); bs->growable = 1; + bs->zero_beyond_eof = false; ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov); bs->growable = growable; + bs->zero_beyond_eof = zero_beyond_eof; + + /* bdrv_co_do_writev will have increased the total_sectors value to include + * the VM state - the VM state is however not an actual part of the block + * device, therefore, we need to restore the old value. */ + bs->total_sectors = total_sectors; return ret; } @@ -1722,16 +2119,212 @@ static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, { BDRVQcowState *s = bs->opaque; int growable = bs->growable; + bool zero_beyond_eof = bs->zero_beyond_eof; int ret; BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); bs->growable = 1; + bs->zero_beyond_eof = false; ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size); bs->growable = growable; + bs->zero_beyond_eof = zero_beyond_eof; return ret; } +/* + * Downgrades an image's version. To achieve this, any incompatible features + * have to be removed. + */ +static int qcow2_downgrade(BlockDriverState *bs, int target_version) +{ + BDRVQcowState *s = bs->opaque; + int current_version = s->qcow_version; + int ret; + + if (target_version == current_version) { + return 0; + } else if (target_version > current_version) { + return -EINVAL; + } else if (target_version != 2) { + return -EINVAL; + } + + if (s->refcount_order != 4) { + /* we would have to convert the image to a refcount_order == 4 image + * here; however, since qemu (at the time of writing this) does not + * support anything different than 4 anyway, there is no point in doing + * so right now; however, we should error out (if qemu supports this in + * the future and this code has not been adapted) */ + error_report("qcow2_downgrade: Image refcount orders other than 4 are " + "currently not supported."); + return -ENOTSUP; + } + + /* clear incompatible features */ + if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { + ret = qcow2_mark_clean(bs); + if (ret < 0) { + return ret; + } + } + + /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in + * the first place; if that happens nonetheless, returning -ENOTSUP is the + * best thing to do anyway */ + + if (s->incompatible_features) { + return -ENOTSUP; + } + + /* since we can ignore compatible features, we can set them to 0 as well */ + s->compatible_features = 0; + /* if lazy refcounts have been used, they have already been fixed through + * clearing the dirty flag */ + + /* clearing autoclear features is trivial */ + s->autoclear_features = 0; + + ret = qcow2_expand_zero_clusters(bs); + if (ret < 0) { + return ret; + } + + s->qcow_version = target_version; + ret = qcow2_update_header(bs); + if (ret < 0) { + s->qcow_version = current_version; + return ret; + } + return 0; +} + +static int qcow2_amend_options(BlockDriverState *bs, + QEMUOptionParameter *options) +{ + BDRVQcowState *s = bs->opaque; + int old_version = s->qcow_version, new_version = old_version; + uint64_t new_size = 0; + const char *backing_file = NULL, *backing_format = NULL; + bool lazy_refcounts = s->use_lazy_refcounts; + int ret; + int i; + + for (i = 0; options[i].name; i++) + { + if (!options[i].assigned) { + /* only change explicitly defined options */ + continue; + } + + if (!strcmp(options[i].name, "compat")) { + if (!options[i].value.s) { + /* preserve default */ + } else if (!strcmp(options[i].value.s, "0.10")) { + new_version = 2; + } else if (!strcmp(options[i].value.s, "1.1")) { + new_version = 3; + } else { + fprintf(stderr, "Unknown compatibility level %s.\n", + options[i].value.s); + return -EINVAL; + } + } else if (!strcmp(options[i].name, "preallocation")) { + fprintf(stderr, "Cannot change preallocation mode.\n"); + return -ENOTSUP; + } else if (!strcmp(options[i].name, "size")) { + new_size = options[i].value.n; + } else if (!strcmp(options[i].name, "backing_file")) { + backing_file = options[i].value.s; + } else if (!strcmp(options[i].name, "backing_fmt")) { + backing_format = options[i].value.s; + } else if (!strcmp(options[i].name, "encryption")) { + if ((options[i].value.n != !!s->crypt_method)) { + fprintf(stderr, "Changing the encryption flag is not " + "supported.\n"); + return -ENOTSUP; + } + } else if (!strcmp(options[i].name, "cluster_size")) { + if (options[i].value.n != s->cluster_size) { + fprintf(stderr, "Changing the cluster size is not " + "supported.\n"); + return -ENOTSUP; + } + } else if (!strcmp(options[i].name, "lazy_refcounts")) { + lazy_refcounts = options[i].value.n; + } else { + /* if this assertion fails, this probably means a new option was + * added without having it covered here */ + assert(false); + } + } + + if (new_version != old_version) { + if (new_version > old_version) { + /* Upgrade */ + s->qcow_version = new_version; + ret = qcow2_update_header(bs); + if (ret < 0) { + s->qcow_version = old_version; + return ret; + } + } else { + ret = qcow2_downgrade(bs, new_version); + if (ret < 0) { + return ret; + } + } + } + + if (backing_file || backing_format) { + ret = qcow2_change_backing_file(bs, backing_file ?: bs->backing_file, + backing_format ?: bs->backing_format); + if (ret < 0) { + return ret; + } + } + + if (s->use_lazy_refcounts != lazy_refcounts) { + if (lazy_refcounts) { + if (s->qcow_version < 3) { + fprintf(stderr, "Lazy refcounts only supported with compatibility " + "level 1.1 and above (use compat=1.1 or greater)\n"); + return -EINVAL; + } + s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS; + ret = qcow2_update_header(bs); + if (ret < 0) { + s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS; + return ret; + } + s->use_lazy_refcounts = true; + } else { + /* make image clean first */ + ret = qcow2_mark_clean(bs); + if (ret < 0) { + return ret; + } + /* now disallow lazy refcounts */ + s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS; + ret = qcow2_update_header(bs); + if (ret < 0) { + s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS; + return ret; + } + s->use_lazy_refcounts = false; + } + } + + if (new_size) { + ret = bdrv_truncate(bs, new_size); + if (ret < 0) { + return ret; + } + } + + return 0; +} + static QEMUOptionParameter qcow2_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -1786,9 +2379,8 @@ static BlockDriver bdrv_qcow2 = { .bdrv_reopen_prepare = qcow2_reopen_prepare, .bdrv_create = qcow2_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_is_allocated = qcow2_co_is_allocated, + .bdrv_co_get_block_status = qcow2_co_get_block_status, .bdrv_set_key = qcow2_set_key, - .bdrv_make_empty = qcow2_make_empty, .bdrv_co_readv = qcow2_co_readv, .bdrv_co_writev = qcow2_co_writev, @@ -1805,16 +2397,19 @@ static BlockDriver bdrv_qcow2 = { .bdrv_snapshot_list = qcow2_snapshot_list, .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp, .bdrv_get_info = qcow2_get_info, + .bdrv_get_specific_info = qcow2_get_specific_info, .bdrv_save_vmstate = qcow2_save_vmstate, .bdrv_load_vmstate = qcow2_load_vmstate, .bdrv_change_backing_file = qcow2_change_backing_file, + .bdrv_refresh_limits = qcow2_refresh_limits, .bdrv_invalidate_cache = qcow2_invalidate_cache, .create_options = qcow2_create_options, .bdrv_check = qcow2_check, + .bdrv_amend_options = qcow2_amend_options, }; static void bdrv_qcow2_init(void) diff --git a/block/qcow2.h b/block/qcow2.h index dba9771419..b49424b85e 100644 --- a/block/qcow2.h +++ b/block/qcow2.h @@ -38,13 +38,26 @@ #define QCOW_CRYPT_AES 1 #define QCOW_MAX_CRYPT_CLUSTERS 32 +#define QCOW_MAX_SNAPSHOTS 65536 + +/* 8 MB refcount table is enough for 2 PB images at 64k cluster size + * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ +#define QCOW_MAX_REFTABLE_SIZE 0x800000 + +/* 32 MB L1 table is enough for 2 PB images at 64k cluster size + * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ +#define QCOW_MAX_L1_SIZE 0x2000000 + +/* Allow for an average of 1k per snapshot table entry, should be plenty of + * space for snapshot names and IDs */ +#define QCOW_MAX_SNAPSHOTS_SIZE (1024 * QCOW_MAX_SNAPSHOTS) /* indicate that the refcount of the referenced cluster is exactly one. */ -#define QCOW_OFLAG_COPIED (1LL << 63) +#define QCOW_OFLAG_COPIED (1ULL << 63) /* indicate that the cluster is compressed (they never have the copied flag) */ -#define QCOW_OFLAG_COMPRESSED (1LL << 62) +#define QCOW_OFLAG_COMPRESSED (1ULL << 62) /* The cluster reads as all zeros */ -#define QCOW_OFLAG_ZERO (1LL << 0) +#define QCOW_OFLAG_ZERO (1ULL << 0) #define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */ @@ -63,6 +76,15 @@ #define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request" #define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot" #define QCOW2_OPT_DISCARD_OTHER "pass-discard-other" +#define QCOW2_OPT_OVERLAP "overlap-check" +#define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header" +#define QCOW2_OPT_OVERLAP_ACTIVE_L1 "overlap-check.active-l1" +#define QCOW2_OPT_OVERLAP_ACTIVE_L2 "overlap-check.active-l2" +#define QCOW2_OPT_OVERLAP_REFCOUNT_TABLE "overlap-check.refcount-table" +#define QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK "overlap-check.refcount-block" +#define QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE "overlap-check.snapshot-table" +#define QCOW2_OPT_OVERLAP_INACTIVE_L1 "overlap-check.inactive-l1" +#define QCOW2_OPT_OVERLAP_INACTIVE_L2 "overlap-check.inactive-l2" typedef struct QCowHeader { uint32_t magic; @@ -86,7 +108,33 @@ typedef struct QCowHeader { uint32_t refcount_order; uint32_t header_length; -} QCowHeader; +} QEMU_PACKED QCowHeader; + +typedef struct QEMU_PACKED QCowSnapshotHeader { + /* header is 8 byte aligned */ + uint64_t l1_table_offset; + + uint32_t l1_size; + uint16_t id_str_size; + uint16_t name_size; + + uint32_t date_sec; + uint32_t date_nsec; + + uint64_t vm_clock_nsec; + + uint32_t vm_state_size; + uint32_t extra_data_size; /* for extension */ + /* extra data follows */ + /* id_str follows */ + /* name follows */ +} QCowSnapshotHeader; + +typedef struct QEMU_PACKED QCowSnapshotExtraData { + uint64_t vm_state_size_large; + uint64_t disk_size; +} QCowSnapshotExtraData; + typedef struct QCowSnapshot { uint64_t l1_table_offset; @@ -119,9 +167,12 @@ enum { /* Incompatible feature bits */ enum { QCOW2_INCOMPAT_DIRTY_BITNR = 0, + QCOW2_INCOMPAT_CORRUPT_BITNR = 1, QCOW2_INCOMPAT_DIRTY = 1 << QCOW2_INCOMPAT_DIRTY_BITNR, + QCOW2_INCOMPAT_CORRUPT = 1 << QCOW2_INCOMPAT_CORRUPT_BITNR, - QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY, + QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY + | QCOW2_INCOMPAT_CORRUPT, }; /* Compatible feature bits */ @@ -179,8 +230,8 @@ typedef struct BDRVQcowState { uint64_t *refcount_table; uint64_t refcount_table_offset; uint32_t refcount_table_size; - int64_t free_cluster_index; - int64_t free_byte_offset; + uint64_t free_cluster_index; + uint64_t free_byte_offset; CoMutex lock; @@ -190,15 +241,18 @@ typedef struct BDRVQcowState { AES_KEY aes_decrypt_key; uint64_t snapshots_offset; int snapshots_size; - int nb_snapshots; + unsigned int nb_snapshots; QCowSnapshot *snapshots; int flags; int qcow_version; bool use_lazy_refcounts; + int refcount_order; bool discard_passthrough[QCOW2_DISCARD_MAX]; + int overlap_check; /* bitmask of Qcow2MetadataOverlap values */ + uint64_t incompatible_features; uint64_t compatible_features; uint64_t autoclear_features; @@ -286,11 +340,50 @@ enum { QCOW2_CLUSTER_ZERO }; -#define L1E_OFFSET_MASK 0x00ffffffffffff00ULL -#define L2E_OFFSET_MASK 0x00ffffffffffff00ULL +typedef enum QCow2MetadataOverlap { + QCOW2_OL_MAIN_HEADER_BITNR = 0, + QCOW2_OL_ACTIVE_L1_BITNR = 1, + QCOW2_OL_ACTIVE_L2_BITNR = 2, + QCOW2_OL_REFCOUNT_TABLE_BITNR = 3, + QCOW2_OL_REFCOUNT_BLOCK_BITNR = 4, + QCOW2_OL_SNAPSHOT_TABLE_BITNR = 5, + QCOW2_OL_INACTIVE_L1_BITNR = 6, + QCOW2_OL_INACTIVE_L2_BITNR = 7, + + QCOW2_OL_MAX_BITNR = 8, + + QCOW2_OL_NONE = 0, + QCOW2_OL_MAIN_HEADER = (1 << QCOW2_OL_MAIN_HEADER_BITNR), + QCOW2_OL_ACTIVE_L1 = (1 << QCOW2_OL_ACTIVE_L1_BITNR), + QCOW2_OL_ACTIVE_L2 = (1 << QCOW2_OL_ACTIVE_L2_BITNR), + QCOW2_OL_REFCOUNT_TABLE = (1 << QCOW2_OL_REFCOUNT_TABLE_BITNR), + QCOW2_OL_REFCOUNT_BLOCK = (1 << QCOW2_OL_REFCOUNT_BLOCK_BITNR), + QCOW2_OL_SNAPSHOT_TABLE = (1 << QCOW2_OL_SNAPSHOT_TABLE_BITNR), + QCOW2_OL_INACTIVE_L1 = (1 << QCOW2_OL_INACTIVE_L1_BITNR), + /* NOTE: Checking overlaps with inactive L2 tables will result in bdrv + * reads. */ + QCOW2_OL_INACTIVE_L2 = (1 << QCOW2_OL_INACTIVE_L2_BITNR), +} QCow2MetadataOverlap; + +/* Perform all overlap checks which can be done in constant time */ +#define QCOW2_OL_CONSTANT \ + (QCOW2_OL_MAIN_HEADER | QCOW2_OL_ACTIVE_L1 | QCOW2_OL_REFCOUNT_TABLE | \ + QCOW2_OL_SNAPSHOT_TABLE) + +/* Perform all overlap checks which don't require disk access */ +#define QCOW2_OL_CACHED \ + (QCOW2_OL_CONSTANT | QCOW2_OL_ACTIVE_L2 | QCOW2_OL_REFCOUNT_BLOCK | \ + QCOW2_OL_INACTIVE_L1) + +/* Perform all overlap checks */ +#define QCOW2_OL_ALL \ + (QCOW2_OL_CACHED | QCOW2_OL_INACTIVE_L2) + +#define L1E_OFFSET_MASK 0x00fffffffffffe00ULL +#define L2E_OFFSET_MASK 0x00fffffffffffe00ULL #define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL -#define REFT_OFFSET_MASK 0xffffffffffffff00ULL +#define REFT_OFFSET_MASK 0xfffffffffffffe00ULL static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset) { @@ -324,6 +417,16 @@ static inline int64_t align_offset(int64_t offset, int n) return offset; } +static inline int64_t qcow2_vm_state_offset(BDRVQcowState *s) +{ + return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); +} + +static inline uint64_t qcow2_max_refcount_clusters(BDRVQcowState *s) +{ + return QCOW_MAX_REFTABLE_SIZE >> s->cluster_bits; +} + static inline int qcow2_get_cluster_type(uint64_t l2_entry) { if (l2_entry & QCOW_OFLAG_COMPRESSED) { @@ -361,13 +464,18 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, int64_t sector_num, int nb_sectors); int qcow2_mark_dirty(BlockDriverState *bs); +int qcow2_mark_corrupt(BlockDriverState *bs); +int qcow2_mark_consistent(BlockDriverState *bs); int qcow2_update_header(BlockDriverState *bs); /* qcow2-refcount.c functions */ int qcow2_refcount_init(BlockDriverState *bs); void qcow2_refcount_close(BlockDriverState *bs); -int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size); +int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index, + int addend, enum qcow2_discard_type type); + +int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size); int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, int nb_clusters); int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size); @@ -385,9 +493,15 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, void qcow2_process_discards(BlockDriverState *bs, int ret); +int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, + int64_t size); +int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, + int64_t size); + /* qcow2-cluster.c functions */ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, bool exact_size); +int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index); void qcow2_l2_cache_reset(BlockDriverState *bs); int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, @@ -398,22 +512,30 @@ void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, int *num, uint64_t *cluster_offset); int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, - int n_start, int n_end, int *num, uint64_t *host_offset, QCowL2Meta **m); + int *num, uint64_t *host_offset, QCowL2Meta **m); uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset, int compressed_size); int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, - int nb_sectors); + int nb_sectors, enum qcow2_discard_type type); int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors); +int qcow2_expand_zero_clusters(BlockDriverState *bs); + /* qcow2-snapshot.c functions */ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info); int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id); -int qcow2_snapshot_delete(BlockDriverState *bs, const char *snapshot_id); +int qcow2_snapshot_delete(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp); int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab); -int qcow2_snapshot_load_tmp(BlockDriverState *bs, const char *snapshot_name); +int qcow2_snapshot_load_tmp(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp); void qcow2_free_snapshots(BlockDriverState *bs); int qcow2_read_snapshots(BlockDriverState *bs); @@ -428,6 +550,8 @@ int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, Qcow2Cache *dependency); void qcow2_cache_depends_on_flush(Qcow2Cache *c); +int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c); + int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, void **table); int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, diff --git a/block/qed.c b/block/qed.c index f767b0528c..c130e42d0d 100644 --- a/block/qed.c +++ b/block/qed.c @@ -353,10 +353,10 @@ static void qed_start_need_check_timer(BDRVQEDState *s) { trace_qed_start_need_check_timer(s); - /* Use vm_clock so we don't alter the image file while suspended for + /* Use QEMU_CLOCK_VIRTUAL so we don't alter the image file while suspended for * migration. */ - qemu_mod_timer(s->need_check_timer, qemu_get_clock_ns(vm_clock) + + timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + get_ticks_per_sec() * QED_NEED_CHECK_TIMEOUT); } @@ -364,7 +364,7 @@ static void qed_start_need_check_timer(BDRVQEDState *s) static void qed_cancel_need_check_timer(BDRVQEDState *s) { trace_qed_cancel_need_check_timer(s); - qemu_del_timer(s->need_check_timer); + timer_del(s->need_check_timer); } static void bdrv_qed_rebind(BlockDriverState *bs) @@ -373,7 +373,8 @@ static void bdrv_qed_rebind(BlockDriverState *bs) s->bs = bs; } -static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags) +static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVQEDState *s = bs->opaque; QEDHeader le_header; @@ -390,14 +391,15 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags) qed_header_le_to_cpu(&le_header, &s->header); if (s->header.magic != QED_MAGIC) { - return -EMEDIUMTYPE; + error_setg(errp, "Image not in QED format"); + return -EINVAL; } if (s->header.features & ~QED_FEATURE_MASK) { /* image uses unsupported feature bits */ char buf[64]; snprintf(buf, sizeof(buf), "%" PRIx64, s->header.features & ~QED_FEATURE_MASK); - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, bs->device_name, "QED", buf); return -ENOTSUP; } @@ -494,7 +496,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags) } } - s->need_check_timer = qemu_new_timer_ns(vm_clock, + s->need_check_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, qed_need_check_timer_cb, s); out: @@ -505,6 +507,15 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags) return ret; } +static int bdrv_qed_refresh_limits(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS; + + return 0; +} + /* We have nothing to do for QED reopen, stubs just return * success */ static int bdrv_qed_reopen_prepare(BDRVReopenState *state, @@ -518,7 +529,7 @@ static void bdrv_qed_close(BlockDriverState *bs) BDRVQEDState *s = bs->opaque; qed_cancel_need_check_timer(s); - qemu_free_timer(s->need_check_timer); + timer_free(s->need_check_timer); /* Ensure writes reach stable storage */ bdrv_flush(bs->file); @@ -535,7 +546,8 @@ static void bdrv_qed_close(BlockDriverState *bs) static int qed_create(const char *filename, uint32_t cluster_size, uint64_t image_size, uint32_t table_size, - const char *backing_file, const char *backing_fmt) + const char *backing_file, const char *backing_fmt, + Error **errp) { QEDHeader header = { .magic = QED_MAGIC, @@ -550,16 +562,22 @@ static int qed_create(const char *filename, uint32_t cluster_size, QEDHeader le_header; uint8_t *l1_table = NULL; size_t l1_size = header.cluster_size * header.table_size; + Error *local_err = NULL; int ret = 0; - BlockDriverState *bs = NULL; + BlockDriverState *bs; - ret = bdrv_create_file(filename, NULL); + ret = bdrv_create_file(filename, NULL, &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } - ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR | BDRV_O_CACHE_WB); + bs = NULL; + ret = bdrv_open(&bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, NULL, + &local_err); if (ret < 0) { + error_propagate(errp, local_err); return ret; } @@ -599,11 +617,12 @@ static int qed_create(const char *filename, uint32_t cluster_size, ret = 0; /* success */ out: g_free(l1_table); - bdrv_delete(bs); + bdrv_unref(bs); return ret; } -static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options) +static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { uint64_t image_size = 0; uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE; @@ -631,71 +650,89 @@ static int bdrv_qed_create(const char *filename, QEMUOptionParameter *options) } if (!qed_is_cluster_size_valid(cluster_size)) { - fprintf(stderr, "QED cluster size must be within range [%u, %u] and power of 2\n", - QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE); + error_setg(errp, "QED cluster size must be within range [%u, %u] " + "and power of 2", + QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE); return -EINVAL; } if (!qed_is_table_size_valid(table_size)) { - fprintf(stderr, "QED table size must be within range [%u, %u] and power of 2\n", - QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE); + error_setg(errp, "QED table size must be within range [%u, %u] " + "and power of 2", + QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE); return -EINVAL; } if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) { - fprintf(stderr, "QED image size must be a non-zero multiple of " - "cluster size and less than %" PRIu64 " bytes\n", - qed_max_image_size(cluster_size, table_size)); + error_setg(errp, "QED image size must be a non-zero multiple of " + "cluster size and less than %" PRIu64 " bytes", + qed_max_image_size(cluster_size, table_size)); return -EINVAL; } return qed_create(filename, cluster_size, image_size, table_size, - backing_file, backing_fmt); + backing_file, backing_fmt, errp); } typedef struct { + BlockDriverState *bs; Coroutine *co; - int is_allocated; + uint64_t pos; + int64_t status; int *pnum; } QEDIsAllocatedCB; static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len) { QEDIsAllocatedCB *cb = opaque; + BDRVQEDState *s = cb->bs->opaque; *cb->pnum = len / BDRV_SECTOR_SIZE; - cb->is_allocated = (ret == QED_CLUSTER_FOUND || ret == QED_CLUSTER_ZERO); + switch (ret) { + case QED_CLUSTER_FOUND: + offset |= qed_offset_into_cluster(s, cb->pos); + cb->status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; + break; + case QED_CLUSTER_ZERO: + cb->status = BDRV_BLOCK_ZERO; + break; + case QED_CLUSTER_L2: + case QED_CLUSTER_L1: + cb->status = 0; + break; + default: + assert(ret < 0); + cb->status = ret; + break; + } + if (cb->co) { qemu_coroutine_enter(cb->co, NULL); } } -static int coroutine_fn bdrv_qed_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { BDRVQEDState *s = bs->opaque; - uint64_t pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE; QEDIsAllocatedCB cb = { - .is_allocated = -1, + .bs = bs, + .pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE, + .status = BDRV_BLOCK_OFFSET_MASK, .pnum = pnum, }; QEDRequest request = { .l2_table = NULL }; - qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb); + qed_find_cluster(s, &request, cb.pos, len, qed_is_allocated_cb, &cb); /* Now sleep if the callback wasn't invoked immediately */ - while (cb.is_allocated == -1) { + while (cb.status == BDRV_BLOCK_OFFSET_MASK) { cb.co = qemu_coroutine_self(); qemu_coroutine_yield(); } qed_unref_l2_cache_entry(request.l2_table); - return cb.is_allocated; -} - -static int bdrv_qed_make_empty(BlockDriverState *bs) -{ - return -ENOTSUP; + return cb.status; } static BDRVQEDState *acb_to_s(QEDAIOCB *acb) @@ -1368,7 +1405,8 @@ static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) + int nb_sectors, + BdrvRequestFlags flags) { BlockDriverAIOCB *blockacb; BDRVQEDState *s = bs->opaque; @@ -1445,6 +1483,8 @@ static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) memset(bdi, 0, sizeof(*bdi)); bdi->cluster_size = s->header.cluster_size; bdi->is_dirty = s->header.features & QED_F_NEED_CHECK; + bdi->unallocated_blocks_are_zero = true; + bdi->can_write_zeroes_with_unmap = true; return 0; } @@ -1520,13 +1560,31 @@ static int bdrv_qed_change_backing_file(BlockDriverState *bs, return ret; } -static void bdrv_qed_invalidate_cache(BlockDriverState *bs) +static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp) { BDRVQEDState *s = bs->opaque; + Error *local_err = NULL; + int ret; bdrv_qed_close(bs); + + bdrv_invalidate_cache(bs->file, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + memset(s, 0, sizeof(BDRVQEDState)); - bdrv_qed_open(bs, NULL, bs->open_flags); + ret = bdrv_qed_open(bs, NULL, bs->open_flags, &local_err); + if (local_err) { + error_setg(errp, "Could not reopen qed layer: %s", + error_get_pretty(local_err)); + error_free(local_err); + return; + } else if (ret < 0) { + error_setg_errno(errp, -ret, "Could not reopen qed layer"); + return; + } } static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result, @@ -1575,14 +1633,14 @@ static BlockDriver bdrv_qed = { .bdrv_reopen_prepare = bdrv_qed_reopen_prepare, .bdrv_create = bdrv_qed_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_is_allocated = bdrv_qed_co_is_allocated, - .bdrv_make_empty = bdrv_qed_make_empty, + .bdrv_co_get_block_status = bdrv_qed_co_get_block_status, .bdrv_aio_readv = bdrv_qed_aio_readv, .bdrv_aio_writev = bdrv_qed_aio_writev, .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes, .bdrv_truncate = bdrv_qed_truncate, .bdrv_getlength = bdrv_qed_getlength, .bdrv_get_info = bdrv_qed_get_info, + .bdrv_refresh_limits = bdrv_qed_refresh_limits, .bdrv_change_backing_file = bdrv_qed_change_backing_file, .bdrv_invalidate_cache = bdrv_qed_invalidate_cache, .bdrv_check = bdrv_qed_check, diff --git a/block/qed.h b/block/qed.h index 2b4ddedf31..5d65bea075 100644 --- a/block/qed.h +++ b/block/qed.h @@ -100,7 +100,7 @@ typedef struct { /* if (features & QED_F_BACKING_FILE) */ uint32_t backing_filename_offset; /* in bytes from start of header */ uint32_t backing_filename_size; /* in bytes */ -} QEDHeader; +} QEMU_PACKED QEDHeader; typedef struct { uint64_t offsets[0]; /* in bytes */ diff --git a/block/quorum.c b/block/quorum.c new file mode 100644 index 0000000000..ecec3a5407 --- /dev/null +++ b/block/quorum.c @@ -0,0 +1,877 @@ +/* + * Quorum Block filter + * + * Copyright (C) 2012-2014 Nodalink, EURL. + * + * Author: + * Benoît Canet + * + * Based on the design and code of blkverify.c (Copyright (C) 2010 IBM, Corp) + * and blkmirror.c (Copyright (C) 2011 Red Hat, Inc). + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include +#include +#include "block/block_int.h" +#include "qapi/qmp/qjson.h" + +#define HASH_LENGTH 32 + +#define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold" +#define QUORUM_OPT_BLKVERIFY "blkverify" + +/* This union holds a vote hash value */ +typedef union QuorumVoteValue { + char h[HASH_LENGTH]; /* SHA-256 hash */ + int64_t l; /* simpler 64 bits hash */ +} QuorumVoteValue; + +/* A vote item */ +typedef struct QuorumVoteItem { + int index; + QLIST_ENTRY(QuorumVoteItem) next; +} QuorumVoteItem; + +/* this structure is a vote version. A version is the set of votes sharing the + * same vote value. + * The set of votes will be tracked with the items field and its cardinality is + * vote_count. + */ +typedef struct QuorumVoteVersion { + QuorumVoteValue value; + int index; + int vote_count; + QLIST_HEAD(, QuorumVoteItem) items; + QLIST_ENTRY(QuorumVoteVersion) next; +} QuorumVoteVersion; + +/* this structure holds a group of vote versions together */ +typedef struct QuorumVotes { + QLIST_HEAD(, QuorumVoteVersion) vote_list; + bool (*compare)(QuorumVoteValue *a, QuorumVoteValue *b); +} QuorumVotes; + +/* the following structure holds the state of one quorum instance */ +typedef struct BDRVQuorumState { + BlockDriverState **bs; /* children BlockDriverStates */ + int num_children; /* children count */ + int threshold; /* if less than threshold children reads gave the + * same result a quorum error occurs. + */ + bool is_blkverify; /* true if the driver is in blkverify mode + * Writes are mirrored on two children devices. + * On reads the two children devices' contents are + * compared and if a difference is spotted its + * location is printed and the code aborts. + * It is useful to debug other block drivers by + * comparing them with a reference one. + */ +} BDRVQuorumState; + +typedef struct QuorumAIOCB QuorumAIOCB; + +/* Quorum will create one instance of the following structure per operation it + * performs on its children. + * So for each read/write operation coming from the upper layer there will be + * $children_count QuorumChildRequest. + */ +typedef struct QuorumChildRequest { + BlockDriverAIOCB *aiocb; + QEMUIOVector qiov; + uint8_t *buf; + int ret; + QuorumAIOCB *parent; +} QuorumChildRequest; + +/* Quorum will use the following structure to track progress of each read/write + * operation received by the upper layer. + * This structure hold pointers to the QuorumChildRequest structures instances + * used to do operations on each children and track overall progress. + */ +struct QuorumAIOCB { + BlockDriverAIOCB common; + + /* Request metadata */ + uint64_t sector_num; + int nb_sectors; + + QEMUIOVector *qiov; /* calling IOV */ + + QuorumChildRequest *qcrs; /* individual child requests */ + int count; /* number of completed AIOCB */ + int success_count; /* number of successfully completed AIOCB */ + + QuorumVotes votes; + + bool is_read; + int vote_ret; +}; + +static void quorum_vote(QuorumAIOCB *acb); + +static void quorum_aio_cancel(BlockDriverAIOCB *blockacb) +{ + QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common); + BDRVQuorumState *s = acb->common.bs->opaque; + int i; + + /* cancel all callbacks */ + for (i = 0; i < s->num_children; i++) { + bdrv_aio_cancel(acb->qcrs[i].aiocb); + } + + g_free(acb->qcrs); + qemu_aio_release(acb); +} + +static AIOCBInfo quorum_aiocb_info = { + .aiocb_size = sizeof(QuorumAIOCB), + .cancel = quorum_aio_cancel, +}; + +static void quorum_aio_finalize(QuorumAIOCB *acb) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + int i, ret = 0; + + if (acb->vote_ret) { + ret = acb->vote_ret; + } + + acb->common.cb(acb->common.opaque, ret); + + if (acb->is_read) { + for (i = 0; i < s->num_children; i++) { + qemu_vfree(acb->qcrs[i].buf); + qemu_iovec_destroy(&acb->qcrs[i].qiov); + } + } + + g_free(acb->qcrs); + qemu_aio_release(acb); +} + +static bool quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b) +{ + return !memcmp(a->h, b->h, HASH_LENGTH); +} + +static bool quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b) +{ + return a->l == b->l; +} + +static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, + BlockDriverState *bs, + QEMUIOVector *qiov, + uint64_t sector_num, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque); + int i; + + acb->common.bs->opaque = s; + acb->sector_num = sector_num; + acb->nb_sectors = nb_sectors; + acb->qiov = qiov; + acb->qcrs = g_new0(QuorumChildRequest, s->num_children); + acb->count = 0; + acb->success_count = 0; + acb->votes.compare = quorum_sha256_compare; + QLIST_INIT(&acb->votes.vote_list); + acb->is_read = false; + acb->vote_ret = 0; + + for (i = 0; i < s->num_children; i++) { + acb->qcrs[i].buf = NULL; + acb->qcrs[i].ret = 0; + acb->qcrs[i].parent = acb; + } + + return acb; +} + +static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret) +{ + QObject *data; + assert(node_name); + data = qobject_from_jsonf("{ 'node-name': %s" + ", 'sector-num': %" PRId64 + ", 'sectors-count': %d }", + node_name, acb->sector_num, acb->nb_sectors); + if (ret < 0) { + QDict *dict = qobject_to_qdict(data); + qdict_put(dict, "error", qstring_from_str(strerror(-ret))); + } + monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, data); + qobject_decref(data); +} + +static void quorum_report_failure(QuorumAIOCB *acb) +{ + QObject *data; + const char *reference = acb->common.bs->device_name[0] ? + acb->common.bs->device_name : + acb->common.bs->node_name; + data = qobject_from_jsonf("{ 'reference': %s" + ", 'sector-num': %" PRId64 + ", 'sectors-count': %d }", + reference, acb->sector_num, acb->nb_sectors); + monitor_protocol_event(QEVENT_QUORUM_FAILURE, data); + qobject_decref(data); +} + +static int quorum_vote_error(QuorumAIOCB *acb); + +static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + + if (acb->success_count < s->threshold) { + acb->vote_ret = quorum_vote_error(acb); + quorum_report_failure(acb); + return true; + } + + return false; +} + +static void quorum_aio_cb(void *opaque, int ret) +{ + QuorumChildRequest *sacb = opaque; + QuorumAIOCB *acb = sacb->parent; + BDRVQuorumState *s = acb->common.bs->opaque; + + sacb->ret = ret; + acb->count++; + if (ret == 0) { + acb->success_count++; + } else { + quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret); + } + assert(acb->count <= s->num_children); + assert(acb->success_count <= s->num_children); + if (acb->count < s->num_children) { + return; + } + + /* Do the vote on read */ + if (acb->is_read) { + quorum_vote(acb); + } else { + quorum_has_too_much_io_failed(acb); + } + + quorum_aio_finalize(acb); +} + +static void quorum_report_bad_versions(BDRVQuorumState *s, + QuorumAIOCB *acb, + QuorumVoteValue *value) +{ + QuorumVoteVersion *version; + QuorumVoteItem *item; + + QLIST_FOREACH(version, &acb->votes.vote_list, next) { + if (acb->votes.compare(&version->value, value)) { + continue; + } + QLIST_FOREACH(item, &version->items, next) { + quorum_report_bad(acb, s->bs[item->index]->node_name, 0); + } + } +} + +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) +{ + int i; + assert(dest->niov == source->niov); + assert(dest->size == source->size); + for (i = 0; i < source->niov; i++) { + assert(dest->iov[i].iov_len == source->iov[i].iov_len); + memcpy(dest->iov[i].iov_base, + source->iov[i].iov_base, + source->iov[i].iov_len); + } +} + +static void quorum_count_vote(QuorumVotes *votes, + QuorumVoteValue *value, + int index) +{ + QuorumVoteVersion *v = NULL, *version = NULL; + QuorumVoteItem *item; + + /* look if we have something with this hash */ + QLIST_FOREACH(v, &votes->vote_list, next) { + if (votes->compare(&v->value, value)) { + version = v; + break; + } + } + + /* It's a version not yet in the list add it */ + if (!version) { + version = g_new0(QuorumVoteVersion, 1); + QLIST_INIT(&version->items); + memcpy(&version->value, value, sizeof(version->value)); + version->index = index; + version->vote_count = 0; + QLIST_INSERT_HEAD(&votes->vote_list, version, next); + } + + version->vote_count++; + + item = g_new0(QuorumVoteItem, 1); + item->index = index; + QLIST_INSERT_HEAD(&version->items, item, next); +} + +static void quorum_free_vote_list(QuorumVotes *votes) +{ + QuorumVoteVersion *version, *next_version; + QuorumVoteItem *item, *next_item; + + QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) { + QLIST_REMOVE(version, next); + QLIST_FOREACH_SAFE(item, &version->items, next, next_item) { + QLIST_REMOVE(item, next); + g_free(item); + } + g_free(version); + } +} + +static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue *hash) +{ + int j, ret; + gnutls_hash_hd_t dig; + QEMUIOVector *qiov = &acb->qcrs[i].qiov; + + ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256); + + if (ret < 0) { + return ret; + } + + for (j = 0; j < qiov->niov; j++) { + ret = gnutls_hash(dig, qiov->iov[j].iov_base, qiov->iov[j].iov_len); + if (ret < 0) { + break; + } + } + + gnutls_hash_deinit(dig, (void *) hash); + return ret; +} + +static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes) +{ + int max = 0; + QuorumVoteVersion *candidate, *winner = NULL; + + QLIST_FOREACH(candidate, &votes->vote_list, next) { + if (candidate->vote_count > max) { + max = candidate->vote_count; + winner = candidate; + } + } + + return winner; +} + +/* qemu_iovec_compare is handy for blkverify mode because it returns the first + * differing byte location. Yet it is handcoded to compare vectors one byte + * after another so it does not benefit from the libc SIMD optimizations. + * quorum_iovec_compare is written for speed and should be used in the non + * blkverify mode of quorum. + */ +static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) +{ + int i; + int result; + + assert(a->niov == b->niov); + for (i = 0; i < a->niov; i++) { + assert(a->iov[i].iov_len == b->iov[i].iov_len); + result = memcmp(a->iov[i].iov_base, + b->iov[i].iov_base, + a->iov[i].iov_len); + if (result) { + return false; + } + } + + return true; +} + +static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb, + const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ", + acb->sector_num, acb->nb_sectors); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + va_end(ap); + exit(1); +} + +static bool quorum_compare(QuorumAIOCB *acb, + QEMUIOVector *a, + QEMUIOVector *b) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + ssize_t offset; + + /* This driver will replace blkverify in this particular case */ + if (s->is_blkverify) { + offset = qemu_iovec_compare(a, b); + if (offset != -1) { + quorum_err(acb, "contents mismatch in sector %" PRId64, + acb->sector_num + + (uint64_t)(offset / BDRV_SECTOR_SIZE)); + } + return true; + } + + return quorum_iovec_compare(a, b); +} + +/* Do a vote to get the error code */ +static int quorum_vote_error(QuorumAIOCB *acb) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + QuorumVoteVersion *winner = NULL; + QuorumVotes error_votes; + QuorumVoteValue result_value; + int i, ret = 0; + bool error = false; + + QLIST_INIT(&error_votes.vote_list); + error_votes.compare = quorum_64bits_compare; + + for (i = 0; i < s->num_children; i++) { + ret = acb->qcrs[i].ret; + if (ret) { + error = true; + result_value.l = ret; + quorum_count_vote(&error_votes, &result_value, i); + } + } + + if (error) { + winner = quorum_get_vote_winner(&error_votes); + ret = winner->value.l; + } + + quorum_free_vote_list(&error_votes); + + return ret; +} + +static void quorum_vote(QuorumAIOCB *acb) +{ + bool quorum = true; + int i, j, ret; + QuorumVoteValue hash; + BDRVQuorumState *s = acb->common.bs->opaque; + QuorumVoteVersion *winner; + + if (quorum_has_too_much_io_failed(acb)) { + return; + } + + /* get the index of the first successful read */ + for (i = 0; i < s->num_children; i++) { + if (!acb->qcrs[i].ret) { + break; + } + } + + assert(i < s->num_children); + + /* compare this read with all other successful reads stopping at quorum + * failure + */ + for (j = i + 1; j < s->num_children; j++) { + if (acb->qcrs[j].ret) { + continue; + } + quorum = quorum_compare(acb, &acb->qcrs[i].qiov, &acb->qcrs[j].qiov); + if (!quorum) { + break; + } + } + + /* Every successful read agrees */ + if (quorum) { + quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov); + return; + } + + /* compute hashes for each successful read, also store indexes */ + for (i = 0; i < s->num_children; i++) { + if (acb->qcrs[i].ret) { + continue; + } + ret = quorum_compute_hash(acb, i, &hash); + /* if ever the hash computation failed */ + if (ret < 0) { + acb->vote_ret = ret; + goto free_exit; + } + quorum_count_vote(&acb->votes, &hash, i); + } + + /* vote to select the most represented version */ + winner = quorum_get_vote_winner(&acb->votes); + + /* if the winner count is smaller than threshold the read fails */ + if (winner->vote_count < s->threshold) { + quorum_report_failure(acb); + acb->vote_ret = -EIO; + goto free_exit; + } + + /* we have a winner: copy it */ + quorum_copy_qiov(acb->qiov, &acb->qcrs[winner->index].qiov); + + /* some versions are bad print them */ + quorum_report_bad_versions(s, acb, &winner->value); + +free_exit: + /* free lists */ + quorum_free_vote_list(&acb->votes); +} + +static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + BDRVQuorumState *s = bs->opaque; + QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, + nb_sectors, cb, opaque); + int i; + + acb->is_read = true; + + for (i = 0; i < s->num_children; i++) { + acb->qcrs[i].buf = qemu_blockalign(s->bs[i], qiov->size); + qemu_iovec_init(&acb->qcrs[i].qiov, qiov->niov); + qemu_iovec_clone(&acb->qcrs[i].qiov, qiov, acb->qcrs[i].buf); + } + + for (i = 0; i < s->num_children; i++) { + bdrv_aio_readv(s->bs[i], sector_num, &acb->qcrs[i].qiov, nb_sectors, + quorum_aio_cb, &acb->qcrs[i]); + } + + return &acb->common; +} + +static BlockDriverAIOCB *quorum_aio_writev(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockDriverCompletionFunc *cb, + void *opaque) +{ + BDRVQuorumState *s = bs->opaque; + QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, nb_sectors, + cb, opaque); + int i; + + for (i = 0; i < s->num_children; i++) { + acb->qcrs[i].aiocb = bdrv_aio_writev(s->bs[i], sector_num, qiov, + nb_sectors, &quorum_aio_cb, + &acb->qcrs[i]); + } + + return &acb->common; +} + +static int64_t quorum_getlength(BlockDriverState *bs) +{ + BDRVQuorumState *s = bs->opaque; + int64_t result; + int i; + + /* check that all file have the same length */ + result = bdrv_getlength(s->bs[0]); + if (result < 0) { + return result; + } + for (i = 1; i < s->num_children; i++) { + int64_t value = bdrv_getlength(s->bs[i]); + if (value < 0) { + return value; + } + if (value != result) { + return -EIO; + } + } + + return result; +} + +static void quorum_invalidate_cache(BlockDriverState *bs, Error **errp) +{ + BDRVQuorumState *s = bs->opaque; + Error *local_err = NULL; + int i; + + for (i = 0; i < s->num_children; i++) { + bdrv_invalidate_cache(s->bs[i], &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } +} + +static coroutine_fn int quorum_co_flush(BlockDriverState *bs) +{ + BDRVQuorumState *s = bs->opaque; + QuorumVoteVersion *winner = NULL; + QuorumVotes error_votes; + QuorumVoteValue result_value; + int i; + int result = 0; + + QLIST_INIT(&error_votes.vote_list); + error_votes.compare = quorum_64bits_compare; + + for (i = 0; i < s->num_children; i++) { + result = bdrv_co_flush(s->bs[i]); + result_value.l = result; + quorum_count_vote(&error_votes, &result_value, i); + } + + winner = quorum_get_vote_winner(&error_votes); + result = winner->value.l; + + quorum_free_vote_list(&error_votes); + + return result; +} + +static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) +{ + BDRVQuorumState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_children; i++) { + bool perm = bdrv_recurse_is_first_non_filter(s->bs[i], + candidate); + if (perm) { + return true; + } + } + + return false; +} + +static int quorum_valid_threshold(int threshold, int num_children, Error **errp) +{ + + if (threshold < 1) { + error_set(errp, QERR_INVALID_PARAMETER_VALUE, + "vote-threshold", "value >= 1"); + return -ERANGE; + } + + if (threshold > num_children) { + error_setg(errp, "threshold may not exceed children count"); + return -ERANGE; + } + + return 0; +} + +static QemuOptsList quorum_runtime_opts = { + .name = "quorum", + .head = QTAILQ_HEAD_INITIALIZER(quorum_runtime_opts.head), + .desc = { + { + .name = QUORUM_OPT_VOTE_THRESHOLD, + .type = QEMU_OPT_NUMBER, + .help = "The number of vote needed for reaching quorum", + }, + { + .name = QUORUM_OPT_BLKVERIFY, + .type = QEMU_OPT_BOOL, + .help = "Trigger block verify mode if set", + }, + { /* end of list */ } + }, +}; + +static int quorum_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVQuorumState *s = bs->opaque; + Error *local_err = NULL; + QemuOpts *opts; + bool *opened; + QDict *sub = NULL; + QList *list = NULL; + const QListEntry *lentry; + int i; + int ret = 0; + + qdict_flatten(options); + qdict_extract_subqdict(options, &sub, "children."); + qdict_array_split(sub, &list); + + if (qdict_size(sub)) { + error_setg(&local_err, "Invalid option children.%s", + qdict_first(sub)->key); + ret = -EINVAL; + goto exit; + } + + /* count how many different children are present */ + s->num_children = qlist_size(list); + if (s->num_children < 2) { + error_setg(&local_err, + "Number of provided children must be greater than 1"); + ret = -EINVAL; + goto exit; + } + + opts = qemu_opts_create(&quorum_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + ret = -EINVAL; + goto exit; + } + + s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0); + + /* and validate it against s->num_children */ + ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err); + if (ret < 0) { + goto exit; + } + + /* is the driver in blkverify mode */ + if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) && + s->num_children == 2 && s->threshold == 2) { + s->is_blkverify = true; + } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) { + fprintf(stderr, "blkverify mode is set by setting blkverify=on " + "and using two files with vote_threshold=2\n"); + } + + /* allocate the children BlockDriverState array */ + s->bs = g_new0(BlockDriverState *, s->num_children); + opened = g_new0(bool, s->num_children); + + for (i = 0, lentry = qlist_first(list); lentry; + lentry = qlist_next(lentry), i++) { + QDict *d; + QString *string; + + switch (qobject_type(lentry->value)) + { + /* List of options */ + case QTYPE_QDICT: + d = qobject_to_qdict(lentry->value); + QINCREF(d); + ret = bdrv_open(&s->bs[i], NULL, NULL, d, flags, NULL, + &local_err); + break; + + /* QMP reference */ + case QTYPE_QSTRING: + string = qobject_to_qstring(lentry->value); + ret = bdrv_open(&s->bs[i], NULL, qstring_get_str(string), NULL, + flags, NULL, &local_err); + break; + + default: + error_setg(&local_err, "Specification of child block device %i " + "is invalid", i); + ret = -EINVAL; + } + + if (ret < 0) { + goto close_exit; + } + opened[i] = true; + } + + g_free(opened); + goto exit; + +close_exit: + /* cleanup on error */ + for (i = 0; i < s->num_children; i++) { + if (!opened[i]) { + continue; + } + bdrv_unref(s->bs[i]); + } + g_free(s->bs); + g_free(opened); +exit: + /* propagate error */ + if (local_err) { + error_propagate(errp, local_err); + } + QDECREF(list); + QDECREF(sub); + return ret; +} + +static void quorum_close(BlockDriverState *bs) +{ + BDRVQuorumState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_children; i++) { + bdrv_unref(s->bs[i]); + } + + g_free(s->bs); +} + +static BlockDriver bdrv_quorum = { + .format_name = "quorum", + .protocol_name = "quorum", + + .instance_size = sizeof(BDRVQuorumState), + + .bdrv_file_open = quorum_open, + .bdrv_close = quorum_close, + + .bdrv_co_flush_to_disk = quorum_co_flush, + + .bdrv_getlength = quorum_getlength, + + .bdrv_aio_readv = quorum_aio_readv, + .bdrv_aio_writev = quorum_aio_writev, + .bdrv_invalidate_cache = quorum_invalidate_cache, + + .is_filter = true, + .bdrv_recurse_is_first_non_filter = quorum_recurse_is_first_non_filter, +}; + +static void bdrv_quorum_init(void) +{ + bdrv_register(&bdrv_quorum); +} + +block_init(bdrv_quorum_init); diff --git a/block/raw-aio.h b/block/raw-aio.h index c61f1595d9..7ad0a8a0a7 100644 --- a/block/raw-aio.h +++ b/block/raw-aio.h @@ -21,9 +21,10 @@ #define QEMU_AIO_IOCTL 0x0004 #define QEMU_AIO_FLUSH 0x0008 #define QEMU_AIO_DISCARD 0x0010 +#define QEMU_AIO_WRITE_ZEROES 0x0020 #define QEMU_AIO_TYPE_MASK \ (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \ - QEMU_AIO_DISCARD) + QEMU_AIO_DISCARD|QEMU_AIO_WRITE_ZEROES) /* AIO flags */ #define QEMU_AIO_MISALIGNED 0x1000 diff --git a/block/raw-posix.c b/block/raw-posix.c index ba721d3f5b..6586a0c9e1 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -127,6 +127,8 @@ typedef struct BDRVRawState { int fd; int type; int open_flags; + size_t buf_align; + #if defined(__linux__) /* linux floppy specific */ int64_t fd_open_time; @@ -139,9 +141,14 @@ typedef struct BDRVRawState { void *aio_ctx; #endif #ifdef CONFIG_XFS - bool is_xfs : 1; + bool is_xfs:1; +#endif + bool has_discard:1; + bool has_write_zeroes:1; + bool discard_zeroes:1; +#ifdef CONFIG_FIEMAP + bool skip_fiemap; #endif - bool has_discard : 1; } BDRVRawState; typedef struct BDRVRawReopenState { @@ -211,6 +218,76 @@ static int raw_normalize_devicepath(const char **filename) } #endif +static void raw_probe_alignment(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + char *buf; + unsigned int sector_size; + + /* For /dev/sg devices the alignment is not really used. + With buffered I/O, we don't have any restrictions. */ + if (bs->sg || !(s->open_flags & O_DIRECT)) { + bs->request_alignment = 1; + s->buf_align = 1; + return; + } + + /* Try a few ioctls to get the right size */ + bs->request_alignment = 0; + s->buf_align = 0; + +#ifdef BLKSSZGET + if (ioctl(s->fd, BLKSSZGET, §or_size) >= 0) { + bs->request_alignment = sector_size; + } +#endif +#ifdef DKIOCGETBLOCKSIZE + if (ioctl(s->fd, DKIOCGETBLOCKSIZE, §or_size) >= 0) { + bs->request_alignment = sector_size; + } +#endif +#ifdef DIOCGSECTORSIZE + if (ioctl(s->fd, DIOCGSECTORSIZE, §or_size) >= 0) { + bs->request_alignment = sector_size; + } +#endif +#ifdef CONFIG_XFS + if (s->is_xfs) { + struct dioattr da; + if (xfsctl(NULL, s->fd, XFS_IOC_DIOINFO, &da) >= 0) { + bs->request_alignment = da.d_miniosz; + /* The kernel returns wrong information for d_mem */ + /* s->buf_align = da.d_mem; */ + } + } +#endif + + /* If we could not get the sizes so far, we can only guess them */ + if (!s->buf_align) { + size_t align; + buf = qemu_memalign(MAX_BLOCKSIZE, 2 * MAX_BLOCKSIZE); + for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { + if (pread(s->fd, buf + align, MAX_BLOCKSIZE, 0) >= 0) { + s->buf_align = align; + break; + } + } + qemu_vfree(buf); + } + + if (!bs->request_alignment) { + size_t align; + buf = qemu_memalign(s->buf_align, MAX_BLOCKSIZE); + for (align = 512; align <= MAX_BLOCKSIZE; align <<= 1) { + if (pread(s->fd, buf, align, 0) >= 0) { + bs->request_alignment = align; + break; + } + } + qemu_vfree(buf); + } +} + static void raw_parse_flags(int bdrv_flags, int *open_flags) { assert(open_flags != NULL); @@ -262,6 +339,17 @@ static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags) } #endif +static void raw_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The filename does not have to be prefixed by the protocol name, since + * "file" is the default protocol; therefore, the return value of this + * function call can be ignored. */ + strstart(filename, "file:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + static QemuOptsList raw_runtime_opts = { .name = "raw", .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), @@ -276,19 +364,19 @@ static QemuOptsList raw_runtime_opts = { }; static int raw_open_common(BlockDriverState *bs, QDict *options, - int bdrv_flags, int open_flags) + int bdrv_flags, int open_flags, Error **errp) { BDRVRawState *s = bs->opaque; QemuOpts *opts; Error *local_err = NULL; - const char *filename; + const char *filename = NULL; int fd, ret; + struct stat st; - opts = qemu_opts_create_nofail(&raw_runtime_opts); + opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } @@ -297,6 +385,7 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, ret = raw_normalize_devicepath(&filename); if (ret != 0) { + error_setg_errno(errp, -ret, "Could not normalize device path"); goto fail; } @@ -318,29 +407,68 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) { qemu_close(fd); ret = -errno; + error_setg_errno(errp, -ret, "Could not set AIO state"); goto fail; } #endif - s->has_discard = 1; + s->has_discard = true; + s->has_write_zeroes = true; + + if (fstat(s->fd, &st) < 0) { + error_setg_errno(errp, errno, "Could not stat file"); + goto fail; + } + if (S_ISREG(st.st_mode)) { + s->discard_zeroes = true; + } + if (S_ISBLK(st.st_mode)) { +#ifdef BLKDISCARDZEROES + unsigned int arg; + if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) { + s->discard_zeroes = true; + } +#endif +#ifdef __linux__ + /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do + * not rely on the contents of discarded blocks unless using O_DIRECT. + * Same for BLKZEROOUT. + */ + if (!(bs->open_flags & BDRV_O_NOCACHE)) { + s->discard_zeroes = false; + s->has_write_zeroes = false; + } +#endif + } + #ifdef CONFIG_XFS if (platform_test_xfs_fd(s->fd)) { - s->is_xfs = 1; + s->is_xfs = true; } #endif ret = 0; fail: + if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) { + unlink(filename); + } qemu_opts_del(opts); return ret; } -static int raw_open(BlockDriverState *bs, QDict *options, int flags) +static int raw_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; + Error *local_err = NULL; + int ret; s->type = FTYPE_FILE; - return raw_open_common(bs, options, flags, 0); + ret = raw_open_common(bs, options, flags, 0, &local_err); + if (local_err) { + error_propagate(errp, local_err); + } + return ret; } static int raw_reopen_prepare(BDRVReopenState *state, @@ -365,6 +493,7 @@ static int raw_reopen_prepare(BDRVReopenState *state, * valid in the 'false' condition even if aio_ctx is set, and raw_set_aio() * won't override aio_ctx if aio_ctx is non-NULL */ if (raw_set_aio(&s->aio_ctx, &raw_s->use_aio, state->flags)) { + error_setg(errp, "Could not set AIO state"); return -1; } #endif @@ -416,13 +545,13 @@ static int raw_reopen_prepare(BDRVReopenState *state, assert(!(raw_s->open_flags & O_CREAT)); raw_s->fd = qemu_open(state->bs->filename, raw_s->open_flags); if (raw_s->fd == -1) { + error_setg_errno(errp, errno, "Could not reopen file"); ret = -1; } } return ret; } - static void raw_reopen_commit(BDRVReopenState *state) { BDRVRawReopenState *raw_s = state->opaque; @@ -458,23 +587,15 @@ static void raw_reopen_abort(BDRVReopenState *state) state->opaque = NULL; } +static int raw_refresh_limits(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; -/* XXX: use host sector size if necessary with: -#ifdef DIOCGSECTORSIZE - { - unsigned int sectorsize = 512; - if (!ioctl(fd, DIOCGSECTORSIZE, §orsize) && - sectorsize > bufsize) - bufsize = sectorsize; - } -#endif -#ifdef CONFIG_COCOA - uint32_t blockSize = 512; - if ( !ioctl( fd, DKIOCGETBLOCKSIZE, &blockSize ) && blockSize > bufsize) { - bufsize = blockSize; - } -#endif -*/ + raw_probe_alignment(bs); + bs->bl.opt_mem_alignment = s->buf_align; + + return 0; +} static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) { @@ -665,6 +786,23 @@ static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) } #ifdef CONFIG_XFS +static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes) +{ + struct xfs_flock64 fl; + + memset(&fl, 0, sizeof(fl)); + fl.l_whence = SEEK_SET; + fl.l_start = offset; + fl.l_len = bytes; + + if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) { + DEBUG_BLOCK_PRINT("cannot write zero range (%s)\n", strerror(errno)); + return -errno; + } + + return 0; +} + static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) { struct xfs_flock64 fl; @@ -683,13 +821,49 @@ static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) } #endif +static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) +{ + int ret = -EOPNOTSUPP; + BDRVRawState *s = aiocb->bs->opaque; + + if (s->has_write_zeroes == 0) { + return -ENOTSUP; + } + + if (aiocb->aio_type & QEMU_AIO_BLKDEV) { +#ifdef BLKZEROOUT + do { + uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; + if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) { + return 0; + } + } while (errno == EINTR); + + ret = -errno; +#endif + } else { +#ifdef CONFIG_XFS + if (s->is_xfs) { + return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes); + } +#endif + } + + if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP || + ret == -ENOTTY) { + s->has_write_zeroes = false; + ret = -ENOTSUP; + } + return ret; +} + static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) { int ret = -EOPNOTSUPP; BDRVRawState *s = aiocb->bs->opaque; - if (s->has_discard == 0) { - return 0; + if (!s->has_discard) { + return -ENOTSUP; } if (aiocb->aio_type & QEMU_AIO_BLKDEV) { @@ -724,8 +898,8 @@ static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) if (ret == -ENODEV || ret == -ENOSYS || ret == -EOPNOTSUPP || ret == -ENOTTY) { - s->has_discard = 0; - ret = 0; + s->has_discard = false; + ret = -ENOTSUP; } return ret; } @@ -767,6 +941,9 @@ static int aio_worker(void *arg) case QEMU_AIO_DISCARD: ret = handle_aiocb_discard(aiocb); break; + case QEMU_AIO_WRITE_ZEROES: + ret = handle_aiocb_write_zeroes(aiocb); + break; default: fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); ret = -EINVAL; @@ -777,6 +954,29 @@ static int aio_worker(void *arg) return ret; } +static int paio_submit_co(BlockDriverState *bs, int fd, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + int type) +{ + RawPosixAIOData *acb = g_slice_new(RawPosixAIOData); + ThreadPool *pool; + + acb->bs = bs; + acb->aio_type = type; + acb->aio_fildes = fd; + + if (qiov) { + acb->aio_iov = qiov->iov; + acb->aio_niov = qiov->niov; + } + acb->aio_nbytes = nb_sectors * 512; + acb->aio_offset = sector_num * 512; + + trace_paio_submit_co(sector_num, nb_sectors, type); + pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); + return thread_pool_submit_co(pool, aio_worker, acb); +} + static BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque, int type) @@ -1040,12 +1240,15 @@ static int64_t raw_get_allocated_file_size(BlockDriverState *bs) return (int64_t)st.st_blocks * 512; } -static int raw_create(const char *filename, QEMUOptionParameter *options) +static int raw_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int fd; int result = 0; int64_t total_size = 0; + strstart(filename, "file:", &filename); + /* Read out options */ while (options && options->name) { if (!strcmp(options->name, BLOCK_OPT_SIZE)) { @@ -1058,63 +1261,43 @@ static int raw_create(const char *filename, QEMUOptionParameter *options) 0644); if (fd < 0) { result = -errno; + error_setg_errno(errp, -result, "Could not create file"); } else { if (ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) { result = -errno; + error_setg_errno(errp, -result, "Could not resize file"); } if (qemu_close(fd) != 0) { result = -errno; + error_setg_errno(errp, -result, "Could not close the new file"); } } return result; } -/* - * Returns true iff the specified sector is present in the disk image. Drivers - * not implementing the functionality are assumed to not support backing files, - * hence all their sectors are reported as allocated. - * - * If 'sector_num' is beyond the end of the disk image the return value is 0 - * and 'pnum' is set to 0. - * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same - * allocated/unallocated state. - * - * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes - * beyond the end of the disk image it will be clamped. - */ -static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum) +static int64_t try_fiemap(BlockDriverState *bs, off_t start, off_t *data, + off_t *hole, int nb_sectors, int *pnum) { - off_t start, data, hole; - int ret; - - ret = fd_open(bs); - if (ret < 0) { - return ret; - } - - start = sector_num * BDRV_SECTOR_SIZE; - #ifdef CONFIG_FIEMAP - BDRVRawState *s = bs->opaque; + int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; struct { struct fiemap fm; struct fiemap_extent fe; } f; + if (s->skip_fiemap) { + return -ENOTSUP; + } + f.fm.fm_start = start; f.fm.fm_length = (int64_t)nb_sectors * BDRV_SECTOR_SIZE; f.fm.fm_flags = 0; f.fm.fm_extent_count = 1; f.fm.fm_reserved = 0; if (ioctl(s->fd, FS_IOC_FIEMAP, &f) == -1) { - /* Assume everything is allocated. */ - *pnum = nb_sectors; - return 1; + s->skip_fiemap = true; + return -errno; } if (f.fm.fm_mapped_extents == 0) { @@ -1122,51 +1305,104 @@ static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, * f.fm.fm_start + f.fm.fm_length must be clamped to the file size! */ off_t length = lseek(s->fd, 0, SEEK_END); - hole = f.fm.fm_start; - data = MIN(f.fm.fm_start + f.fm.fm_length, length); + *hole = f.fm.fm_start; + *data = MIN(f.fm.fm_start + f.fm.fm_length, length); } else { - data = f.fe.fe_logical; - hole = f.fe.fe_logical + f.fe.fe_length; + *data = f.fe.fe_logical; + *hole = f.fe.fe_logical + f.fe.fe_length; + if (f.fe.fe_flags & FIEMAP_EXTENT_UNWRITTEN) { + ret |= BDRV_BLOCK_ZERO; + } } -#elif defined SEEK_HOLE && defined SEEK_DATA + return ret; +#else + return -ENOTSUP; +#endif +} +static int64_t try_seek_hole(BlockDriverState *bs, off_t start, off_t *data, + off_t *hole, int *pnum) +{ +#if defined SEEK_HOLE && defined SEEK_DATA BDRVRawState *s = bs->opaque; - hole = lseek(s->fd, start, SEEK_HOLE); - if (hole == -1) { + *hole = lseek(s->fd, start, SEEK_HOLE); + if (*hole == -1) { /* -ENXIO indicates that sector_num was past the end of the file. * There is a virtual hole there. */ assert(errno != -ENXIO); - /* Most likely EINVAL. Assume everything is allocated. */ - *pnum = nb_sectors; - return 1; + return -errno; } - if (hole > start) { - data = start; + if (*hole > start) { + *data = start; } else { /* On a hole. We need another syscall to find its end. */ - data = lseek(s->fd, start, SEEK_DATA); - if (data == -1) { - data = lseek(s->fd, 0, SEEK_END); + *data = lseek(s->fd, start, SEEK_DATA); + if (*data == -1) { + *data = lseek(s->fd, 0, SEEK_END); } } + + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; #else - *pnum = nb_sectors; - return 1; + return -ENOTSUP; #endif +} + +/* + * Returns true iff the specified sector is present in the disk image. Drivers + * not implementing the functionality are assumed to not support backing files, + * hence all their sectors are reported as allocated. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. + */ +static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + off_t start, data = 0, hole = 0; + int64_t ret; + + ret = fd_open(bs); + if (ret < 0) { + return ret; + } + + start = sector_num * BDRV_SECTOR_SIZE; + + ret = try_fiemap(bs, start, &data, &hole, nb_sectors, pnum); + if (ret < 0) { + ret = try_seek_hole(bs, start, &data, &hole, pnum); + if (ret < 0) { + /* Assume everything is allocated. */ + data = 0; + hole = start + nb_sectors * BDRV_SECTOR_SIZE; + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; + } + } if (data <= start) { /* On a data extent, compute sectors to the end of the extent. */ *pnum = MIN(nb_sectors, (hole - start) / BDRV_SECTOR_SIZE); - return 1; } else { /* On a hole, compute sectors to the beginning of the next extent. */ *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); - return 0; + ret &= ~BDRV_BLOCK_DATA; + ret |= BDRV_BLOCK_ZERO; } + + return ret; } static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs, @@ -1179,6 +1415,31 @@ static coroutine_fn BlockDriverAIOCB *raw_aio_discard(BlockDriverState *bs, cb, opaque, QEMU_AIO_DISCARD); } +static int coroutine_fn raw_co_write_zeroes( + BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + BDRVRawState *s = bs->opaque; + + if (!(flags & BDRV_REQ_MAY_UNMAP)) { + return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + QEMU_AIO_WRITE_ZEROES); + } else if (s->discard_zeroes) { + return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + QEMU_AIO_DISCARD); + } + return -ENOTSUP; +} + +static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVRawState *s = bs->opaque; + + bdi->unallocated_blocks_are_zero = s->discard_zeroes; + bdi->can_write_zeroes_with_unmap = s->discard_zeroes; + return 0; +} + static QEMUOptionParameter raw_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -1192,7 +1453,9 @@ static BlockDriver bdrv_file = { .format_name = "file", .protocol_name = "file", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, .bdrv_probe = NULL, /* no probe for protocols */ + .bdrv_parse_filename = raw_parse_filename, .bdrv_file_open = raw_open, .bdrv_reopen_prepare = raw_reopen_prepare, .bdrv_reopen_commit = raw_reopen_commit, @@ -1200,15 +1463,18 @@ static BlockDriver bdrv_file = { .bdrv_close = raw_close, .bdrv_create = raw_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_is_allocated = raw_co_is_allocated, + .bdrv_co_get_block_status = raw_co_get_block_status, + .bdrv_co_write_zeroes = raw_co_write_zeroes, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, .bdrv_aio_discard = raw_aio_discard, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, + .bdrv_get_info = raw_get_info, .bdrv_get_allocated_file_size = raw_get_allocated_file_size, @@ -1325,9 +1591,20 @@ static int check_hdev_writable(BDRVRawState *s) return 0; } -static int hdev_open(BlockDriverState *bs, QDict *options, int flags) +static void hdev_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The prefix is optional, just as for "file". */ + strstart(filename, "host_device:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static int hdev_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; + Error *local_err = NULL; int ret; const char *filename = qdict_get_str(options, "filename"); @@ -1371,8 +1648,11 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags) } #endif - ret = raw_open_common(bs, options, flags, 0); + ret = raw_open_common(bs, options, flags, 0, &local_err); if (ret < 0) { + if (local_err) { + error_propagate(errp, local_err); + } return ret; } @@ -1380,6 +1660,7 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags) ret = check_hdev_writable(s); if (ret < 0) { raw_close(bs); + error_setg_errno(errp, -ret, "The device is not writable"); return ret; } } @@ -1498,12 +1779,45 @@ static coroutine_fn BlockDriverAIOCB *hdev_aio_discard(BlockDriverState *bs, cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); } -static int hdev_create(const char *filename, QEMUOptionParameter *options) +static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + BDRVRawState *s = bs->opaque; + int rc; + + rc = fd_open(bs); + if (rc < 0) { + return rc; + } + if (!(flags & BDRV_REQ_MAY_UNMAP)) { + return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV); + } else if (s->discard_zeroes) { + return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); + } + return -ENOTSUP; +} + +static int hdev_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int fd; int ret = 0; struct stat stat_buf; int64_t total_size = 0; + bool has_prefix; + + /* This function is used by all three protocol block drivers and therefore + * any of these three prefixes may be given. + * The return value has to be stored somewhere, otherwise this is an error + * due to -Werror=unused-value. */ + has_prefix = + strstart(filename, "host_device:", &filename) || + strstart(filename, "host_cdrom:" , &filename) || + strstart(filename, "host_floppy:", &filename); + + (void)has_prefix; /* Read out options */ while (options && options->name) { @@ -1514,15 +1828,23 @@ static int hdev_create(const char *filename, QEMUOptionParameter *options) } fd = qemu_open(filename, O_WRONLY | O_BINARY); - if (fd < 0) - return -errno; + if (fd < 0) { + ret = -errno; + error_setg_errno(errp, -ret, "Could not open device"); + return ret; + } - if (fstat(fd, &stat_buf) < 0) + if (fstat(fd, &stat_buf) < 0) { ret = -errno; - else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) + error_setg_errno(errp, -ret, "Could not stat device"); + } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) { + error_setg(errp, + "The given file is neither a block nor a character device"); ret = -ENODEV; - else if (lseek(fd, 0, SEEK_END) < total_size * BDRV_SECTOR_SIZE) + } else if (lseek(fd, 0, SEEK_END) < total_size * BDRV_SECTOR_SIZE) { + error_setg(errp, "Device is too small"); ret = -ENOSPC; + } qemu_close(fd); return ret; @@ -1532,7 +1854,9 @@ static BlockDriver bdrv_host_device = { .format_name = "host_device", .protocol_name = "host_device", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, .bdrv_probe_device = hdev_probe_device, + .bdrv_parse_filename = hdev_parse_filename, .bdrv_file_open = hdev_open, .bdrv_close = raw_close, .bdrv_reopen_prepare = raw_reopen_prepare, @@ -1540,14 +1864,17 @@ static BlockDriver bdrv_host_device = { .bdrv_reopen_abort = raw_reopen_abort, .bdrv_create = hdev_create, .create_options = raw_create_options, + .bdrv_co_write_zeroes = hdev_co_write_zeroes, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, .bdrv_aio_discard = hdev_aio_discard, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, .bdrv_getlength = raw_getlength, + .bdrv_get_info = raw_get_info, .bdrv_get_allocated_file_size = raw_get_allocated_file_size, @@ -1559,17 +1886,32 @@ static BlockDriver bdrv_host_device = { }; #ifdef __linux__ -static int floppy_open(BlockDriverState *bs, QDict *options, int flags) +static void floppy_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The prefix is optional, just as for "file". */ + strstart(filename, "host_floppy:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static int floppy_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; + Error *local_err = NULL; int ret; s->type = FTYPE_FD; /* open will not fail even if no floppy is inserted, so add O_NONBLOCK */ - ret = raw_open_common(bs, options, flags, O_NONBLOCK); - if (ret) + ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err); + if (ret) { + if (local_err) { + error_propagate(errp, local_err); + } return ret; + } /* close fd so that we can reopen it as needed */ qemu_close(s->fd); @@ -1656,7 +1998,9 @@ static BlockDriver bdrv_host_floppy = { .format_name = "host_floppy", .protocol_name = "host_floppy", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, .bdrv_probe_device = floppy_probe_device, + .bdrv_parse_filename = floppy_parse_filename, .bdrv_file_open = floppy_open, .bdrv_close = raw_close, .bdrv_reopen_prepare = raw_reopen_prepare, @@ -1668,9 +2012,11 @@ static BlockDriver bdrv_host_floppy = { .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, - .bdrv_getlength = raw_getlength, + .bdrv_getlength = raw_getlength, + .has_variable_length = true, .bdrv_get_allocated_file_size = raw_get_allocated_file_size, @@ -1679,15 +2025,35 @@ static BlockDriver bdrv_host_floppy = { .bdrv_media_changed = floppy_media_changed, .bdrv_eject = floppy_eject, }; +#endif + +#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +static void cdrom_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The prefix is optional, just as for "file". */ + strstart(filename, "host_cdrom:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} +#endif -static int cdrom_open(BlockDriverState *bs, QDict *options, int flags) +#ifdef __linux__ +static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; + Error *local_err = NULL; + int ret; s->type = FTYPE_CD; /* open will not fail even if no CD is inserted, so add O_NONBLOCK */ - return raw_open_common(bs, options, flags, O_NONBLOCK); + ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err); + if (local_err) { + error_propagate(errp, local_err); + } + return ret; } static int cdrom_probe_device(const char *filename) @@ -1757,7 +2123,9 @@ static BlockDriver bdrv_host_cdrom = { .format_name = "host_cdrom", .protocol_name = "host_cdrom", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, .bdrv_probe_device = cdrom_probe_device, + .bdrv_parse_filename = cdrom_parse_filename, .bdrv_file_open = cdrom_open, .bdrv_close = raw_close, .bdrv_reopen_prepare = raw_reopen_prepare, @@ -1769,9 +2137,11 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, - .bdrv_getlength = raw_getlength, + .bdrv_getlength = raw_getlength, + .has_variable_length = true, .bdrv_get_allocated_file_size = raw_get_allocated_file_size, @@ -1787,16 +2157,22 @@ static BlockDriver bdrv_host_cdrom = { #endif /* __linux__ */ #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) -static int cdrom_open(BlockDriverState *bs, QDict *options, int flags) +static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; + Error *local_err = NULL; int ret; s->type = FTYPE_CD; - ret = raw_open_common(bs, options, flags, 0); - if (ret) + ret = raw_open_common(bs, options, flags, 0, &local_err); + if (ret) { + if (local_err) { + error_propagate(errp, local_err); + } return ret; + } /* make sure the door isn't locked at this time */ ioctl(s->fd, CDIOCALLOW); @@ -1878,7 +2254,9 @@ static BlockDriver bdrv_host_cdrom = { .format_name = "host_cdrom", .protocol_name = "host_cdrom", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, .bdrv_probe_device = cdrom_probe_device, + .bdrv_parse_filename = cdrom_parse_filename, .bdrv_file_open = cdrom_open, .bdrv_close = raw_close, .bdrv_reopen_prepare = raw_reopen_prepare, @@ -1890,9 +2268,11 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_refresh_limits = raw_refresh_limits, .bdrv_truncate = raw_truncate, - .bdrv_getlength = raw_getlength, + .bdrv_getlength = raw_getlength, + .has_variable_length = true, .bdrv_get_allocated_file_size = raw_get_allocated_file_size, diff --git a/block/raw-win32.c b/block/raw-win32.c index 9b5b2af4e8..064ea3123c 100644 --- a/block/raw-win32.c +++ b/block/raw-win32.c @@ -85,6 +85,7 @@ static size_t handle_aiocb_rw(RawWin32AIOData *aiocb) ret_count = 0; } if (ret_count != len) { + offset += ret_count; break; } offset += len; @@ -201,6 +202,35 @@ static int set_sparse(int fd) NULL, 0, NULL, 0, &returned, NULL); } +static void raw_probe_alignment(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + DWORD sectorsPerCluster, freeClusters, totalClusters, count; + DISK_GEOMETRY_EX dg; + BOOL status; + + if (s->type == FTYPE_CD) { + bs->request_alignment = 2048; + return; + } + if (s->type == FTYPE_HARDDISK) { + status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX, + NULL, 0, &dg, sizeof(dg), &count, NULL); + if (status != 0) { + bs->request_alignment = dg.Geometry.BytesPerSector; + return; + } + /* try GetDiskFreeSpace too */ + } + + if (s->drive_path[0]) { + GetDiskFreeSpace(s->drive_path, §orsPerCluster, + &dg.Geometry.BytesPerSector, + &freeClusters, &totalClusters); + bs->request_alignment = dg.Geometry.BytesPerSector; + } +} + static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped) { assert(access_flags != NULL); @@ -221,6 +251,17 @@ static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped) } } +static void raw_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The filename does not have to be prefixed by the protocol name, since + * "file" is the default protocol; therefore, the return value of this + * function call can be ignored. */ + strstart(filename, "file:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + static QemuOptsList raw_runtime_opts = { .name = "raw", .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), @@ -234,7 +275,8 @@ static QemuOptsList raw_runtime_opts = { }, }; -static int raw_open(BlockDriverState *bs, QDict *options, int flags) +static int raw_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; int access_flags; @@ -246,11 +288,10 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags) s->type = FTYPE_FILE; - opts = qemu_opts_create_nofail(&raw_runtime_opts); + opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } @@ -262,11 +303,23 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags) if ((flags & BDRV_O_NATIVE_AIO) && aio == NULL) { aio = win32_aio_init(); if (aio == NULL) { + error_setg(errp, "Could not initialize AIO"); ret = -EINVAL; goto fail; } } + if (filename[0] && filename[1] == ':') { + snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", filename[0]); + } else if (filename[0] == '\\' && filename[1] == '\\') { + s->drive_path[0] = 0; + } else { + /* Relative path. */ + char buf[MAX_PATH]; + GetCurrentDirectory(MAX_PATH, buf); + snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", buf[0]); + } + s->hfile = CreateFile(filename, access_flags, FILE_SHARE_READ, NULL, OPEN_EXISTING, overlapped, NULL); @@ -285,11 +338,13 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags) ret = win32_aio_attach(aio, s->hfile); if (ret < 0) { CloseHandle(s->hfile); + error_setg_errno(errp, -ret, "Could not enable AIO"); goto fail; } s->aio = aio; } + raw_probe_alignment(bs); ret = 0; fail: qemu_opts_del(opts); @@ -335,6 +390,9 @@ static void raw_close(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; CloseHandle(s->hfile); + if (bs->open_flags & BDRV_O_TEMPORARY) { + unlink(bs->filename); + } } static int raw_truncate(BlockDriverState *bs, int64_t offset) @@ -420,11 +478,14 @@ static int64_t raw_get_allocated_file_size(BlockDriverState *bs) return st.st_size; } -static int raw_create(const char *filename, QEMUOptionParameter *options) +static int raw_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int fd; int64_t total_size = 0; + strstart(filename, "file:", &filename); + /* Read out options */ while (options && options->name) { if (!strcmp(options->name, BLOCK_OPT_SIZE)) { @@ -435,8 +496,10 @@ static int raw_create(const char *filename, QEMUOptionParameter *options) fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); - if (fd < 0) + if (fd < 0) { + error_setg_errno(errp, errno, "Could not create file"); return -EIO; + } set_sparse(fd); ftruncate(fd, total_size * 512); qemu_close(fd); @@ -456,6 +519,8 @@ static BlockDriver bdrv_file = { .format_name = "file", .protocol_name = "file", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_parse_filename = raw_parse_filename, .bdrv_file_open = raw_open, .bdrv_close = raw_close, .bdrv_create = raw_create, @@ -531,17 +596,44 @@ static int hdev_probe_device(const char *filename) return 0; } -static int hdev_open(BlockDriverState *bs, QDict *options, int flags) +static void hdev_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The prefix is optional, just as for "file". */ + strstart(filename, "host_device:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static int hdev_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRawState *s = bs->opaque; int access_flags, create_flags; + int ret = 0; DWORD overlapped; char device_name[64]; - const char *filename = qdict_get_str(options, "filename"); + + Error *local_err = NULL; + const char *filename; + + QemuOpts *opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, + &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto done; + } + + filename = qemu_opt_get(opts, "filename"); if (strstart(filename, "/dev/cdrom", NULL)) { - if (find_cdrom(device_name, sizeof(device_name)) < 0) - return -ENOENT; + if (find_cdrom(device_name, sizeof(device_name)) < 0) { + error_setg(errp, "Could not open CD-ROM drive"); + ret = -ENOENT; + goto done; + } filename = device_name; } else { /* transform drive letters into device name */ @@ -564,17 +656,26 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags) if (s->hfile == INVALID_HANDLE_VALUE) { int err = GetLastError(); - if (err == ERROR_ACCESS_DENIED) - return -EACCES; - return -1; + if (err == ERROR_ACCESS_DENIED) { + ret = -EACCES; + } else { + ret = -EINVAL; + } + error_setg_errno(errp, -ret, "Could not open device"); + goto done; } - return 0; + +done: + qemu_opts_del(opts); + return ret; } static BlockDriver bdrv_host_device = { .format_name = "host_device", .protocol_name = "host_device", .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_parse_filename = hdev_parse_filename, .bdrv_probe_device = hdev_probe_device, .bdrv_file_open = hdev_open, .bdrv_close = raw_close, @@ -583,7 +684,9 @@ static BlockDriver bdrv_host_device = { .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, - .bdrv_getlength = raw_getlength, + .bdrv_getlength = raw_getlength, + .has_variable_length = true, + .bdrv_get_allocated_file_size = raw_get_allocated_file_size, }; diff --git a/block/raw.c b/block/raw_bsd.c similarity index 51% rename from block/raw.c rename to block/raw_bsd.c index 47518253fe..01ea692a46 100644 --- a/block/raw.c +++ b/block/raw_bsd.c @@ -1,13 +1,17 @@ -/* - * Block driver for RAW format +/* BlockDriver implementation for "raw" * - * Copyright (c) 2006 Fabrice Bellard + * Copyright (C) 2010, 2013, Red Hat, Inc. + * Copyright (C) 2010, Blue Swirl + * Copyright (C) 2009, Anthony Liguori + * + * Author: + * Laszlo Ersek * * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in @@ -15,27 +19,27 @@ * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. */ -#include "qemu-common.h" #include "block/block_int.h" -#include "qemu/module.h" +#include "qemu/option.h" -static int raw_open(BlockDriverState *bs, QDict *options, int flags) -{ - bs->sg = bs->file->sg; - return 0; -} +static QEMUOptionParameter raw_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size" + }, + { 0 } +}; -/* We have nothing to do for raw reopen, stubs just return - * success */ -static int raw_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) +static int raw_reopen_prepare(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp) { return 0; } @@ -54,22 +58,26 @@ static int coroutine_fn raw_co_writev(BlockDriverState *bs, int64_t sector_num, return bdrv_co_writev(bs->file, sector_num, nb_sectors, qiov); } -static void raw_close(BlockDriverState *bs) -{ -} - -static int coroutine_fn raw_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { - return bdrv_co_is_allocated(bs->file, sector_num, nb_sectors, pnum); + *pnum = nb_sectors; + return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA | + (sector_num << BDRV_SECTOR_BITS); } static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors) + int64_t sector_num, int nb_sectors, + BdrvRequestFlags flags) { - return bdrv_co_write_zeroes(bs->file, sector_num, nb_sectors); + return bdrv_co_write_zeroes(bs->file, sector_num, nb_sectors, flags); +} + +static int coroutine_fn raw_co_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + return bdrv_co_discard(bs->file, sector_num, nb_sectors); } static int64_t raw_getlength(BlockDriverState *bs) @@ -77,20 +85,20 @@ static int64_t raw_getlength(BlockDriverState *bs) return bdrv_getlength(bs->file); } -static int raw_truncate(BlockDriverState *bs, int64_t offset) +static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { - return bdrv_truncate(bs->file, offset); + return bdrv_get_info(bs->file, bdi); } -static int raw_probe(const uint8_t *buf, int buf_size, const char *filename) +static int raw_refresh_limits(BlockDriverState *bs) { - return 1; /* everything can be opened as raw image */ + bs->bl = bs->file->bl; + return 0; } -static int coroutine_fn raw_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) +static int raw_truncate(BlockDriverState *bs, int64_t offset) { - return bdrv_co_discard(bs->file, sector_num, nb_sectors); + return bdrv_truncate(bs->file, offset); } static int raw_is_inserted(BlockDriverState *bs) @@ -115,73 +123,79 @@ static void raw_lock_medium(BlockDriverState *bs, bool locked) static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) { - return bdrv_ioctl(bs->file, req, buf); + return bdrv_ioctl(bs->file, req, buf); } static BlockDriverAIOCB *raw_aio_ioctl(BlockDriverState *bs, - unsigned long int req, void *buf, - BlockDriverCompletionFunc *cb, void *opaque) -{ - return bdrv_aio_ioctl(bs->file, req, buf, cb, opaque); -} - -static int raw_create(const char *filename, QEMUOptionParameter *options) + unsigned long int req, void *buf, + BlockDriverCompletionFunc *cb, + void *opaque) { - return bdrv_create_file(filename, options); + return bdrv_aio_ioctl(bs->file, req, buf, cb, opaque); } -static QEMUOptionParameter raw_create_options[] = { - { - .name = BLOCK_OPT_SIZE, - .type = OPT_SIZE, - .help = "Virtual disk size" - }, - { NULL } -}; - static int raw_has_zero_init(BlockDriverState *bs) { return bdrv_has_zero_init(bs->file); } -static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +static int raw_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { - return bdrv_get_info(bs->file, bdi); -} - -static BlockDriver bdrv_raw = { - .format_name = "raw", - - /* It's really 0, but we need to make g_malloc() happy */ - .instance_size = 1, - - .bdrv_open = raw_open, - .bdrv_close = raw_close, - - .bdrv_reopen_prepare = raw_reopen_prepare, + Error *local_err = NULL; + int ret; - .bdrv_co_readv = raw_co_readv, - .bdrv_co_writev = raw_co_writev, - .bdrv_co_is_allocated = raw_co_is_allocated, - .bdrv_co_write_zeroes = raw_co_write_zeroes, - .bdrv_co_discard = raw_co_discard, + ret = bdrv_create_file(filename, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + } + return ret; +} - .bdrv_probe = raw_probe, - .bdrv_getlength = raw_getlength, - .bdrv_get_info = raw_get_info, - .bdrv_truncate = raw_truncate, +static int raw_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + bs->sg = bs->file->sg; + return 0; +} - .bdrv_is_inserted = raw_is_inserted, - .bdrv_media_changed = raw_media_changed, - .bdrv_eject = raw_eject, - .bdrv_lock_medium = raw_lock_medium, +static void raw_close(BlockDriverState *bs) +{ +} - .bdrv_ioctl = raw_ioctl, - .bdrv_aio_ioctl = raw_aio_ioctl, +static int raw_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + /* smallest possible positive score so that raw is used if and only if no + * other block driver works + */ + return 1; +} - .bdrv_create = raw_create, - .create_options = raw_create_options, - .bdrv_has_zero_init = raw_has_zero_init, +static BlockDriver bdrv_raw = { + .format_name = "raw", + .bdrv_probe = &raw_probe, + .bdrv_reopen_prepare = &raw_reopen_prepare, + .bdrv_open = &raw_open, + .bdrv_close = &raw_close, + .bdrv_create = &raw_create, + .bdrv_co_readv = &raw_co_readv, + .bdrv_co_writev = &raw_co_writev, + .bdrv_co_write_zeroes = &raw_co_write_zeroes, + .bdrv_co_discard = &raw_co_discard, + .bdrv_co_get_block_status = &raw_co_get_block_status, + .bdrv_truncate = &raw_truncate, + .bdrv_getlength = &raw_getlength, + .has_variable_length = true, + .bdrv_get_info = &raw_get_info, + .bdrv_refresh_limits = &raw_refresh_limits, + .bdrv_is_inserted = &raw_is_inserted, + .bdrv_media_changed = &raw_media_changed, + .bdrv_eject = &raw_eject, + .bdrv_lock_medium = &raw_lock_medium, + .bdrv_ioctl = &raw_ioctl, + .bdrv_aio_ioctl = &raw_aio_ioctl, + .create_options = &raw_create_options[0], + .bdrv_has_zero_init = &raw_has_zero_init }; static void bdrv_raw_init(void) diff --git a/block/rbd.c b/block/rbd.c index cb71751218..dbc79f4525 100644 --- a/block/rbd.c +++ b/block/rbd.c @@ -95,19 +95,13 @@ typedef struct RADOSCB { #define RBD_FD_WRITE 1 typedef struct BDRVRBDState { - int fds[2]; rados_t cluster; rados_ioctx_t io_ctx; rbd_image_t image; char name[RBD_MAX_IMAGE_NAME_SIZE]; - int qemu_aio_count; char *snap; - int event_reader_pos; - RADOSCB *event_rcb; } BDRVRBDState; -static void rbd_aio_bh_cb(void *opaque); - static int qemu_rbd_next_tok(char *dst, int dst_len, char *src, char delim, const char *name, @@ -288,7 +282,8 @@ static int qemu_rbd_set_conf(rados_t cluster, const char *conf) return ret; } -static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options) +static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int64_t bytes = 0; int64_t objsize; @@ -369,9 +364,8 @@ static int qemu_rbd_create(const char *filename, QEMUOptionParameter *options) } /* - * This aio completion is being called from qemu_rbd_aio_event_reader() - * and runs in qemu context. It schedules a bh, but just in case the aio - * was not cancelled before. + * This aio completion is being called from rbd_finish_bh() and runs in qemu + * BH context. */ static void qemu_rbd_complete_aio(RADOSCB *rcb) { @@ -401,44 +395,19 @@ static void qemu_rbd_complete_aio(RADOSCB *rcb) acb->ret = r; } } - /* Note that acb->bh can be NULL in case where the aio was cancelled */ - acb->bh = qemu_bh_new(rbd_aio_bh_cb, acb); - qemu_bh_schedule(acb->bh); - g_free(rcb); -} - -/* - * aio fd read handler. It runs in the qemu context and calls the - * completion handling of completed rados aio operations. - */ -static void qemu_rbd_aio_event_reader(void *opaque) -{ - BDRVRBDState *s = opaque; - - ssize_t ret; - do { - char *p = (char *)&s->event_rcb; - - /* now read the rcb pointer that was sent from a non qemu thread */ - ret = read(s->fds[RBD_FD_READ], p + s->event_reader_pos, - sizeof(s->event_rcb) - s->event_reader_pos); - if (ret > 0) { - s->event_reader_pos += ret; - if (s->event_reader_pos == sizeof(s->event_rcb)) { - s->event_reader_pos = 0; - qemu_rbd_complete_aio(s->event_rcb); - s->qemu_aio_count--; - } - } - } while (ret < 0 && errno == EINTR); -} + g_free(rcb); -static int qemu_rbd_aio_flush_cb(void *opaque) -{ - BDRVRBDState *s = opaque; + if (acb->cmd == RBD_AIO_READ) { + qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + } + qemu_vfree(acb->bounce); + acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); + acb->status = 0; - return (s->qemu_aio_count > 0); + if (!acb->cancelled) { + qemu_aio_release(acb); + } } /* TODO Convert to fine grained options */ @@ -455,7 +424,8 @@ static QemuOptsList runtime_opts = { }, }; -static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags) +static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVRBDState *s = bs->opaque; char pool[RBD_MAX_POOL_NAME_SIZE]; @@ -468,9 +438,9 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags) const char *filename; int r; - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { qerror_report_err(local_err); error_free(local_err); qemu_opts_del(opts); @@ -545,23 +515,9 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags) bs->read_only = (s->snap != NULL); - s->event_reader_pos = 0; - r = qemu_pipe(s->fds); - if (r < 0) { - error_report("error opening eventfd"); - goto failed; - } - fcntl(s->fds[0], F_SETFL, O_NONBLOCK); - fcntl(s->fds[1], F_SETFL, O_NONBLOCK); - qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], qemu_rbd_aio_event_reader, - NULL, qemu_rbd_aio_flush_cb, s); - - qemu_opts_del(opts); return 0; -failed: - rbd_close(s->image); failed_open: rados_ioctx_destroy(s->io_ctx); failed_shutdown: @@ -576,10 +532,6 @@ static void qemu_rbd_close(BlockDriverState *bs) { BDRVRBDState *s = bs->opaque; - close(s->fds[0]); - close(s->fds[1]); - qemu_aio_set_fd_handler(s->fds[RBD_FD_READ], NULL, NULL, NULL, NULL); - rbd_close(s->image); rados_ioctx_destroy(s->io_ctx); g_free(s->snap); @@ -607,34 +559,11 @@ static const AIOCBInfo rbd_aiocb_info = { .cancel = qemu_rbd_aio_cancel, }; -static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb) +static void rbd_finish_bh(void *opaque) { - int ret = 0; - while (1) { - fd_set wfd; - int fd = s->fds[RBD_FD_WRITE]; - - /* send the op pointer to the qemu thread that is responsible - for the aio/op completion. Must do it in a qemu thread context */ - ret = write(fd, (void *)&rcb, sizeof(rcb)); - if (ret >= 0) { - break; - } - if (errno == EINTR) { - continue; - } - if (errno != EAGAIN) { - break; - } - - FD_ZERO(&wfd); - FD_SET(fd, &wfd); - do { - ret = select(fd + 1, NULL, &wfd, NULL, NULL); - } while (ret < 0 && errno == EINTR); - } - - return ret; + RADOSCB *rcb = opaque; + qemu_bh_delete(rcb->acb->bh); + qemu_rbd_complete_aio(rcb); } /* @@ -642,40 +571,18 @@ static int qemu_rbd_send_pipe(BDRVRBDState *s, RADOSCB *rcb) * * Note: this function is being called from a non qemu thread so * we need to be careful about what we do here. Generally we only - * write to the block notification pipe, and do the rest of the - * io completion handling from qemu_rbd_aio_event_reader() which - * runs in a qemu context. + * schedule a BH, and do the rest of the io completion handling + * from rbd_finish_bh() which runs in a qemu context. */ static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb) { - int ret; + RBDAIOCB *acb = rcb->acb; + rcb->ret = rbd_aio_get_return_value(c); rbd_aio_release(c); - ret = qemu_rbd_send_pipe(rcb->s, rcb); - if (ret < 0) { - error_report("failed writing to acb->s->fds"); - g_free(rcb); - } -} - -/* Callback when all queued rbd_aio requests are complete */ - -static void rbd_aio_bh_cb(void *opaque) -{ - RBDAIOCB *acb = opaque; - if (acb->cmd == RBD_AIO_READ) { - qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); - } - qemu_vfree(acb->bounce); - acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); - qemu_bh_delete(acb->bh); - acb->bh = NULL; - acb->status = 0; - - if (!acb->cancelled) { - qemu_aio_release(acb); - } + acb->bh = qemu_bh_new(rbd_finish_bh, rcb); + qemu_bh_schedule(acb->bh); } static int rbd_aio_discard_wrapper(rbd_image_t image, @@ -741,8 +648,6 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs, off = sector_num * BDRV_SECTOR_SIZE; size = nb_sectors * BDRV_SECTOR_SIZE; - s->qemu_aio_count++; /* All the RADOSCB */ - rcb = g_malloc(sizeof(RADOSCB)); rcb->done = 0; rcb->acb = acb; @@ -779,7 +684,6 @@ static BlockDriverAIOCB *rbd_start_aio(BlockDriverState *bs, failed: g_free(rcb); - s->qemu_aio_count--; qemu_aio_release(acb); return NULL; } @@ -903,12 +807,31 @@ static int qemu_rbd_snap_create(BlockDriverState *bs, } static int qemu_rbd_snap_remove(BlockDriverState *bs, - const char *snapshot_name) + const char *snapshot_id, + const char *snapshot_name, + Error **errp) { BDRVRBDState *s = bs->opaque; int r; + if (!snapshot_name) { + error_setg(errp, "rbd need a valid snapshot name"); + return -EINVAL; + } + + /* If snapshot_id is specified, it must be equal to name, see + qemu_rbd_snap_list() */ + if (snapshot_id && strcmp(snapshot_id, snapshot_name)) { + error_setg(errp, + "rbd do not support snapshot id, it should be NULL or " + "equal to snapshot name"); + return -EINVAL; + } + r = rbd_snap_remove(s->image, snapshot_name); + if (r < 0) { + error_setg_errno(errp, -r, "Failed to remove the snapshot"); + } return r; } @@ -934,7 +857,7 @@ static int qemu_rbd_snap_list(BlockDriverState *bs, do { snaps = g_malloc(sizeof(*snaps) * max_snaps); snap_count = rbd_snap_list(s->image, snaps, &max_snaps); - if (snap_count < 0) { + if (snap_count <= 0) { g_free(snaps); } } while (snap_count == -ERANGE); @@ -958,6 +881,7 @@ static int qemu_rbd_snap_list(BlockDriverState *bs, sn_info->vm_clock_nsec = 0; } rbd_snap_list_end(snaps); + g_free(snaps); done: *psn_tab = sn_tab; @@ -993,6 +917,7 @@ static QEMUOptionParameter qemu_rbd_create_options[] = { static BlockDriver bdrv_rbd = { .format_name = "rbd", .instance_size = sizeof(BDRVRBDState), + .bdrv_needs_filename = true, .bdrv_file_open = qemu_rbd_open, .bdrv_close = qemu_rbd_close, .bdrv_create = qemu_rbd_create, diff --git a/block/sheepdog.c b/block/sheepdog.c index afe053376c..2c3fb016a8 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -91,6 +91,14 @@ #define SD_NR_VDIS (1U << 24) #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22) #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS) +/* + * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and + * (SD_EC_MAX_STRIP - 1) for parity strips + * + * SD_MAX_COPIES is sum of number of data strips and parity strips. + */ +#define SD_EC_MAX_STRIP 16 +#define SD_MAX_COPIES (SD_EC_MAX_STRIP * 2 - 1) #define SD_INODE_SIZE (sizeof(SheepdogInode)) #define CURRENT_VDI_ID 0 @@ -125,8 +133,9 @@ typedef struct SheepdogObjReq { uint32_t data_length; uint64_t oid; uint64_t cow_oid; - uint32_t copies; - uint32_t rsvd; + uint8_t copies; + uint8_t copy_policy; + uint8_t reserved[6]; uint64_t offset; } SheepdogObjReq; @@ -138,7 +147,9 @@ typedef struct SheepdogObjRsp { uint32_t id; uint32_t data_length; uint32_t result; - uint32_t copies; + uint8_t copies; + uint8_t copy_policy; + uint8_t reserved[2]; uint32_t pad[6]; } SheepdogObjRsp; @@ -150,8 +161,10 @@ typedef struct SheepdogVdiReq { uint32_t id; uint32_t data_length; uint64_t vdi_size; - uint32_t vdi_id; - uint32_t copies; + uint32_t base_vdi_id; + uint8_t copies; + uint8_t copy_policy; + uint8_t reserved[2]; uint32_t snapid; uint32_t pad[3]; } SheepdogVdiReq; @@ -222,6 +235,11 @@ static inline uint64_t data_oid_to_idx(uint64_t oid) return oid & (MAX_DATA_OBJS - 1); } +static inline uint32_t oid_to_vid(uint64_t oid) +{ + return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT; +} + static inline uint64_t vid_to_vdi_oid(uint32_t vid) { return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT); @@ -289,11 +307,14 @@ struct SheepdogAIOCB { Coroutine *coroutine; void (*aio_done_func)(SheepdogAIOCB *); - bool canceled; + bool cancelable; + bool *finished; int nr_pending; }; typedef struct BDRVSheepdogState { + BlockDriverState *bs; + SheepdogInode inode; uint32_t min_dirty_data_idx; @@ -313,8 +334,11 @@ typedef struct BDRVSheepdogState { Coroutine *co_recv; uint32_t aioreq_seq_num; + + /* Every aio request must be linked to either of these queues. */ QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head; QLIST_HEAD(pending_aio_head, AIOReq) pending_aio_head; + QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head; } BDRVSheepdogState; static const char * sd_strerror(int err) @@ -403,6 +427,7 @@ static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req) { SheepdogAIOCB *acb = aio_req->aiocb; + acb->cancelable = false; QLIST_REMOVE(aio_req, aio_siblings); g_free(aio_req); @@ -411,23 +436,68 @@ static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req) static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb) { - if (!acb->canceled) { - qemu_coroutine_enter(acb->coroutine, NULL); + qemu_coroutine_enter(acb->coroutine, NULL); + if (acb->finished) { + *acb->finished = true; } qemu_aio_release(acb); } +/* + * Check whether the specified acb can be canceled + * + * We can cancel aio when any request belonging to the acb is: + * - Not processed by the sheepdog server. + * - Not linked to the inflight queue. + */ +static bool sd_acb_cancelable(const SheepdogAIOCB *acb) +{ + BDRVSheepdogState *s = acb->common.bs->opaque; + AIOReq *aioreq; + + if (!acb->cancelable) { + return false; + } + + QLIST_FOREACH(aioreq, &s->inflight_aio_head, aio_siblings) { + if (aioreq->aiocb == acb) { + return false; + } + } + + return true; +} + static void sd_aio_cancel(BlockDriverAIOCB *blockacb) { SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb; + BDRVSheepdogState *s = acb->common.bs->opaque; + AIOReq *aioreq, *next; + bool finished = false; + + acb->finished = &finished; + while (!finished) { + if (sd_acb_cancelable(acb)) { + /* Remove outstanding requests from pending and failed queues. */ + QLIST_FOREACH_SAFE(aioreq, &s->pending_aio_head, aio_siblings, + next) { + if (aioreq->aiocb == acb) { + free_aio_req(s, aioreq); + } + } + QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings, + next) { + if (aioreq->aiocb == acb) { + free_aio_req(s, aioreq); + } + } - /* - * Sheepdog cannot cancel the requests which are already sent to - * the servers, so we just complete the request with -EIO here. - */ - acb->ret = -EIO; - qemu_coroutine_enter(acb->coroutine, NULL); - acb->canceled = true; + assert(acb->nr_pending == 0); + sd_finish_aiocb(acb); + return; + } + qemu_aio_wait(); + } } static const AIOCBInfo sd_aiocb_info = { @@ -448,7 +518,8 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, acb->nb_sectors = nb_sectors; acb->aio_done_func = NULL; - acb->canceled = false; + acb->cancelable = true; + acb->finished = NULL; acb->coroutine = qemu_coroutine_self(); acb->ret = 0; acb->nr_pending = 0; @@ -489,13 +560,13 @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data, int ret; ret = qemu_co_send(sockfd, hdr, sizeof(*hdr)); - if (ret < sizeof(*hdr)) { + if (ret != sizeof(*hdr)) { error_report("failed to send a req, %s", strerror(errno)); return ret; } ret = qemu_co_send(sockfd, data, *wlen); - if (ret < *wlen) { + if (ret != *wlen) { error_report("failed to send a req, %s", strerror(errno)); } @@ -509,13 +580,6 @@ static void restart_co_req(void *opaque) qemu_coroutine_enter(co, NULL); } -static int have_co_req(void *opaque) -{ - /* this handler is set only when there is a pending request, so - * always returns 1. */ - return 1; -} - typedef struct SheepdogReqCo { int sockfd; SheepdogReq *hdr; @@ -538,17 +602,17 @@ static coroutine_fn void do_co_req(void *opaque) unsigned int *rlen = srco->rlen; co = qemu_coroutine_self(); - qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, have_co_req, co); + qemu_aio_set_fd_handler(sockfd, NULL, restart_co_req, co); ret = send_co_req(sockfd, hdr, data, wlen); if (ret < 0) { goto out; } - qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, have_co_req, co); + qemu_aio_set_fd_handler(sockfd, restart_co_req, NULL, co); ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr)); - if (ret < sizeof(*hdr)) { + if (ret != sizeof(*hdr)) { error_report("failed to get a rsp, %s", strerror(errno)); ret = -errno; goto out; @@ -560,7 +624,7 @@ static coroutine_fn void do_co_req(void *opaque) if (*rlen) { ret = qemu_co_recv(sockfd, data, *rlen); - if (ret < *rlen) { + if (ret != *rlen) { error_report("failed to get the data, %s", strerror(errno)); ret = -errno; goto out; @@ -570,7 +634,7 @@ static coroutine_fn void do_co_req(void *opaque) out: /* there is at most one request for this sockfd, so it is safe to * set each handler to NULL. */ - qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(sockfd, NULL, NULL, NULL); srco->ret = ret; srco->finished = true; @@ -603,11 +667,13 @@ static int do_req(int sockfd, SheepdogReq *hdr, void *data, return srco.ret; } -static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, +static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, struct iovec *iov, int niov, bool create, enum AIOCBState aiocb_type); -static int coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req); - +static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req); +static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag); +static int get_sheep_fd(BDRVSheepdogState *s); +static void co_write_request(void *opaque); static AIOReq *find_pending_req(BDRVSheepdogState *s, uint64_t oid) { @@ -630,22 +696,59 @@ static void coroutine_fn send_pending_req(BDRVSheepdogState *s, uint64_t oid) { AIOReq *aio_req; SheepdogAIOCB *acb; - int ret; while ((aio_req = find_pending_req(s, oid)) != NULL) { acb = aio_req->aiocb; /* move aio_req from pending list to inflight one */ QLIST_REMOVE(aio_req, aio_siblings); QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - ret = add_aio_request(s, aio_req, acb->qiov->iov, - acb->qiov->niov, false, acb->aiocb_type); - if (ret < 0) { - error_report("add_aio_request is failed"); - free_aio_req(s, aio_req); - if (!acb->nr_pending) { - sd_finish_aiocb(acb); - } + add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, false, + acb->aiocb_type); + } +} + +static coroutine_fn void reconnect_to_sdog(void *opaque) +{ + BDRVSheepdogState *s = opaque; + AIOReq *aio_req, *next; + + qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL); + close(s->fd); + s->fd = -1; + + /* Wait for outstanding write requests to be completed. */ + while (s->co_send != NULL) { + co_write_request(opaque); + } + + /* Try to reconnect the sheepdog server every one second. */ + while (s->fd < 0) { + s->fd = get_sheep_fd(s); + if (s->fd < 0) { + DPRINTF("Wait for connection to be established\n"); + co_aio_sleep_ns(bdrv_get_aio_context(s->bs), QEMU_CLOCK_REALTIME, + 1000000000ULL); } + }; + + /* + * Now we have to resend all the request in the inflight queue. However, + * resend_aioreq() can yield and newly created requests can be added to the + * inflight queue before the coroutine is resumed. To avoid mixing them, we + * have to move all the inflight requests to the failed queue before + * resend_aioreq() is called. + */ + QLIST_FOREACH_SAFE(aio_req, &s->inflight_aio_head, aio_siblings, next) { + QLIST_REMOVE(aio_req, aio_siblings); + QLIST_INSERT_HEAD(&s->failed_aio_head, aio_req, aio_siblings); + } + + /* Resend all the failed aio requests. */ + while (!QLIST_EMPTY(&s->failed_aio_head)) { + aio_req = QLIST_FIRST(&s->failed_aio_head); + QLIST_REMOVE(aio_req, aio_siblings); + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); + resend_aioreq(s, aio_req); } } @@ -665,15 +768,11 @@ static void coroutine_fn aio_read_response(void *opaque) SheepdogAIOCB *acb; uint64_t idx; - if (QLIST_EMPTY(&s->inflight_aio_head)) { - goto out; - } - /* read a header */ ret = qemu_co_recv(fd, &rsp, sizeof(rsp)); - if (ret < 0) { + if (ret != sizeof(rsp)) { error_report("failed to get the header, %s", strerror(errno)); - goto out; + goto err; } /* find the right aio_req from the inflight aio list */ @@ -684,7 +783,7 @@ static void coroutine_fn aio_read_response(void *opaque) } if (!aio_req) { error_report("cannot find aio_req %x", rsp.id); - goto out; + goto err; } acb = aio_req->aiocb; @@ -722,9 +821,9 @@ static void coroutine_fn aio_read_response(void *opaque) case AIOCB_READ_UDATA: ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov, aio_req->iov_offset, rsp.data_length); - if (ret < 0) { + if (ret != rsp.data_length) { error_report("failed to get the data, %s", strerror(errno)); - goto out; + goto err; } break; case AIOCB_FLUSH_CACHE: @@ -755,11 +854,20 @@ static void coroutine_fn aio_read_response(void *opaque) case SD_RES_SUCCESS: break; case SD_RES_READONLY: - ret = resend_aioreq(s, aio_req); - if (ret == SD_RES_SUCCESS) { - goto out; + if (s->inode.vdi_id == oid_to_vid(aio_req->oid)) { + ret = reload_inode(s, 0, ""); + if (ret < 0) { + goto err; + } } - /* fall through */ + if (is_data_obj(aio_req->oid)) { + aio_req->oid = vid_to_data_oid(s->inode.vdi_id, + data_oid_to_idx(aio_req->oid)); + } else { + aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id); + } + resend_aioreq(s, aio_req); + goto out; default: acb->ret = -EIO; error_report("%s", sd_strerror(rsp.result)); @@ -776,6 +884,10 @@ static void coroutine_fn aio_read_response(void *opaque) } out: s->co_recv = NULL; + return; +err: + s->co_recv = NULL; + reconnect_to_sdog(opaque); } static void co_read_response(void *opaque) @@ -796,18 +908,10 @@ static void co_write_request(void *opaque) qemu_coroutine_enter(s->co_send, NULL); } -static int aio_flush_request(void *opaque) -{ - BDRVSheepdogState *s = opaque; - - return !QLIST_EMPTY(&s->inflight_aio_head) || - !QLIST_EMPTY(&s->pending_aio_head); -} - /* - * Return a socket discriptor to read/write objects. + * Return a socket descriptor to read/write objects. * - * We cannot use this discriptor for other operations because + * We cannot use this descriptor for other operations because * the block driver may be on waiting response from the server. */ static int get_sheep_fd(BDRVSheepdogState *s) @@ -819,7 +923,7 @@ static int get_sheep_fd(BDRVSheepdogState *s) return fd; } - qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request, s); + qemu_aio_set_fd_handler(fd, co_read_response, NULL, s); return fd; } @@ -995,7 +1099,7 @@ static int find_vdi_name(BDRVSheepdogState *s, const char *filename, } if (rsp->result != SD_RES_SUCCESS) { - error_report("cannot get vdi info, %s, %s %d %s", + error_report("cannot get vdi info, %s, %s %" PRIu32 " %s", sd_strerror(rsp->result), filename, snapid, tag); if (rsp->result == SD_RES_NO_VDI) { ret = -ENOENT; @@ -1012,7 +1116,7 @@ static int find_vdi_name(BDRVSheepdogState *s, const char *filename, return ret; } -static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, +static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, struct iovec *iov, int niov, bool create, enum AIOCBState aiocb_type) { @@ -1069,36 +1173,30 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, qemu_co_mutex_lock(&s->lock); s->co_send = qemu_coroutine_self(); - qemu_aio_set_fd_handler(s->fd, co_read_response, co_write_request, - aio_flush_request, s); + qemu_aio_set_fd_handler(s->fd, co_read_response, co_write_request, s); socket_set_cork(s->fd, 1); /* send a header */ ret = qemu_co_send(s->fd, &hdr, sizeof(hdr)); - if (ret < 0) { - qemu_co_mutex_unlock(&s->lock); + if (ret != sizeof(hdr)) { error_report("failed to send a req, %s", strerror(errno)); - return -errno; + goto out; } if (wlen) { ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen); - if (ret < 0) { - qemu_co_mutex_unlock(&s->lock); + if (ret != wlen) { error_report("failed to send a data, %s", strerror(errno)); - return -errno; } } - +out: socket_set_cork(s->fd, 0); - qemu_aio_set_fd_handler(s->fd, co_read_response, NULL, - aio_flush_request, s); + qemu_aio_set_fd_handler(s->fd, co_read_response, NULL, s); + s->co_send = NULL; qemu_co_mutex_unlock(&s->lock); - - return 0; } -static int read_write_object(int fd, char *buf, uint64_t oid, int copies, +static int read_write_object(int fd, char *buf, uint64_t oid, uint8_t copies, unsigned int datalen, uint64_t offset, bool write, bool create, uint32_t cache_flags) { @@ -1146,7 +1244,7 @@ static int read_write_object(int fd, char *buf, uint64_t oid, int copies, } } -static int read_object(int fd, char *buf, uint64_t oid, int copies, +static int read_object(int fd, char *buf, uint64_t oid, uint8_t copies, unsigned int datalen, uint64_t offset, uint32_t cache_flags) { @@ -1154,7 +1252,7 @@ static int read_object(int fd, char *buf, uint64_t oid, int copies, false, cache_flags); } -static int write_object(int fd, char *buf, uint64_t oid, int copies, +static int write_object(int fd, char *buf, uint64_t oid, uint8_t copies, unsigned int datalen, uint64_t offset, bool create, uint32_t cache_flags) { @@ -1198,51 +1296,62 @@ static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag) return ret; } -static int coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) +/* Return true if the specified request is linked to the pending list. */ +static bool check_simultaneous_create(BDRVSheepdogState *s, AIOReq *aio_req) { - SheepdogAIOCB *acb = aio_req->aiocb; - bool create = false; - int ret; - - ret = reload_inode(s, 0, ""); - if (ret < 0) { - return ret; + AIOReq *areq; + QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) { + if (areq != aio_req && areq->oid == aio_req->oid) { + /* + * Sheepdog cannot handle simultaneous create requests to the same + * object, so we cannot send the request until the previous request + * finishes. + */ + DPRINTF("simultaneous create to %" PRIx64 "\n", aio_req->oid); + aio_req->flags = 0; + aio_req->base_oid = 0; + QLIST_REMOVE(aio_req, aio_siblings); + QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings); + return true; + } } - aio_req->oid = vid_to_data_oid(s->inode.vdi_id, - data_oid_to_idx(aio_req->oid)); + return false; +} + +static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) +{ + SheepdogAIOCB *acb = aio_req->aiocb; + bool create = false; /* check whether this request becomes a CoW one */ - if (acb->aiocb_type == AIOCB_WRITE_UDATA) { + if (acb->aiocb_type == AIOCB_WRITE_UDATA && is_data_obj(aio_req->oid)) { int idx = data_oid_to_idx(aio_req->oid); - AIOReq *areq; - if (s->inode.data_vdi_id[idx] == 0) { - create = true; - goto out; - } if (is_data_obj_writable(&s->inode, idx)) { goto out; } - /* link to the pending list if there is another CoW request to - * the same object */ - QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) { - if (areq != aio_req && areq->oid == aio_req->oid) { - DPRINTF("simultaneous CoW to %" PRIx64 "\n", aio_req->oid); - QLIST_REMOVE(aio_req, aio_siblings); - QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, aio_siblings); - return SD_RES_SUCCESS; - } + if (check_simultaneous_create(s, aio_req)) { + return; } - aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx); - aio_req->flags |= SD_FLAG_CMD_COW; + if (s->inode.data_vdi_id[idx]) { + aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx); + aio_req->flags |= SD_FLAG_CMD_COW; + } create = true; } out: - return add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, - create, acb->aiocb_type); + if (is_data_obj(aio_req->oid)) { + add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, create, + acb->aiocb_type); + } else { + struct iovec iov; + iov.iov_base = &s->inode; + iov.iov_len = sizeof(s->inode); + add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA); + } } /* TODO Convert to fine grained options */ @@ -1259,7 +1368,8 @@ static QemuOptsList runtime_opts = { }, }; -static int sd_open(BlockDriverState *bs, QDict *options, int flags) +static int sd_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { int ret, fd; uint32_t vid = 0; @@ -1271,9 +1381,11 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags) Error *local_err = NULL; const char *filename; - opts = qemu_opts_create_nofail(&runtime_opts); + s->bs = bs; + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { qerror_report_err(local_err); error_free(local_err); ret = -EINVAL; @@ -1284,6 +1396,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags) QLIST_INIT(&s->inflight_aio_head); QLIST_INIT(&s->pending_aio_head); + QLIST_INIT(&s->failed_aio_head); s->fd = -1; memset(vdi, 0, sizeof(vdi)); @@ -1350,7 +1463,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags) g_free(buf); return 0; out: - qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL); if (s->fd >= 0) { closesocket(s->fd); } @@ -1359,8 +1472,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags) return ret; } -static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size, - uint32_t base_vid, uint32_t *vdi_id, int snapshot) +static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot) { SheepdogVdiReq hdr; SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; @@ -1377,11 +1489,11 @@ static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size, * does not fit in buf? For now, just truncate and avoid buffer overrun. */ memset(buf, 0, sizeof(buf)); - pstrcpy(buf, sizeof(buf), filename); + pstrcpy(buf, sizeof(buf), s->name); memset(&hdr, 0, sizeof(hdr)); hdr.opcode = SD_OP_NEW_VDI; - hdr.vdi_id = base_vid; + hdr.base_vdi_id = s->inode.vdi_id; wlen = SD_MAX_VDI_LEN; @@ -1389,7 +1501,9 @@ static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size, hdr.snapid = snapshot; hdr.data_length = wlen; - hdr.vdi_size = vdi_size; + hdr.vdi_size = s->inode.vdi_size; + hdr.copy_policy = s->inode.copy_policy; + hdr.copies = s->inode.nr_copies; ret = do_req(fd, (SheepdogReq *)&hdr, buf, &wlen, &rlen); @@ -1400,7 +1514,7 @@ static int do_sd_create(BDRVSheepdogState *s, char *filename, int64_t vdi_size, } if (rsp->result != SD_RES_SUCCESS) { - error_report("%s, %s", sd_strerror(rsp->result), filename); + error_report("%s, %s", sd_strerror(rsp->result), s->inode.name); return -EIO; } @@ -1417,10 +1531,14 @@ static int sd_prealloc(const char *filename) uint32_t idx, max_idx; int64_t vdi_size; void *buf = g_malloc0(SD_DATA_OBJ_SIZE); + Error *local_err = NULL; int ret; - ret = bdrv_file_open(&bs, filename, NULL, BDRV_O_RDWR); + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + NULL, &local_err); if (ret < 0) { + qerror_report_err(local_err); + error_free(local_err); goto out; } @@ -1447,32 +1565,86 @@ static int sd_prealloc(const char *filename) } out: if (bs) { - bdrv_delete(bs); + bdrv_unref(bs); } g_free(buf); return ret; } -static int sd_create(const char *filename, QEMUOptionParameter *options) +/* + * Sheepdog support two kinds of redundancy, full replication and erasure + * coding. + * + * # create a fully replicated vdi with x copies + * -o redundancy=x (1 <= x <= SD_MAX_COPIES) + * + * # create a erasure coded vdi with x data strips and y parity strips + * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP) + */ +static int parse_redundancy(BDRVSheepdogState *s, const char *opt) +{ + struct SheepdogInode *inode = &s->inode; + const char *n1, *n2; + long copy, parity; + char p[10]; + + pstrcpy(p, sizeof(p), opt); + n1 = strtok(p, ":"); + n2 = strtok(NULL, ":"); + + if (!n1) { + return -EINVAL; + } + + copy = strtol(n1, NULL, 10); + if (copy > SD_MAX_COPIES || copy < 1) { + return -EINVAL; + } + if (!n2) { + inode->copy_policy = 0; + inode->nr_copies = copy; + return 0; + } + + if (copy != 2 && copy != 4 && copy != 8 && copy != 16) { + return -EINVAL; + } + + parity = strtol(n2, NULL, 10); + if (parity >= SD_EC_MAX_STRIP || parity < 1) { + return -EINVAL; + } + + /* + * 4 bits for parity and 4 bits for data. + * We have to compress upper data bits because it can't represent 16 + */ + inode->copy_policy = ((copy / 2) << 4) + parity; + inode->nr_copies = copy + parity; + + return 0; +} + +static int sd_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int ret = 0; - uint32_t vid = 0, base_vid = 0; - int64_t vdi_size = 0; + uint32_t vid = 0; char *backing_file = NULL; BDRVSheepdogState *s; - char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN]; + char tag[SD_MAX_VDI_TAG_LEN]; uint32_t snapid; bool prealloc = false; + Error *local_err = NULL; s = g_malloc0(sizeof(BDRVSheepdogState)); - memset(vdi, 0, sizeof(vdi)); memset(tag, 0, sizeof(tag)); if (strstr(filename, "://")) { - ret = sd_parse_uri(s, filename, vdi, &snapid, tag); + ret = sd_parse_uri(s, filename, s->name, &snapid, tag); } else { - ret = parse_vdiname(s, filename, vdi, &snapid, tag); + ret = parse_vdiname(s, filename, s->name, &snapid, tag); } if (ret < 0) { goto out; @@ -1480,7 +1652,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options) while (options && options->name) { if (!strcmp(options->name, BLOCK_OPT_SIZE)) { - vdi_size = options->value.n; + s->inode.vdi_size = options->value.n; } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { backing_file = options->value.s; } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) { @@ -1494,11 +1666,18 @@ static int sd_create(const char *filename, QEMUOptionParameter *options) ret = -EINVAL; goto out; } + } else if (!strcmp(options->name, BLOCK_OPT_REDUNDANCY)) { + if (options->value.s) { + ret = parse_redundancy(s, options->value.s); + if (ret < 0) { + goto out; + } + } } options++; } - if (vdi_size > SD_MAX_VDI_SIZE) { + if (s->inode.vdi_size > SD_MAX_VDI_SIZE) { error_report("too big image size"); ret = -EINVAL; goto out; @@ -1506,7 +1685,7 @@ static int sd_create(const char *filename, QEMUOptionParameter *options) if (backing_file) { BlockDriverState *bs; - BDRVSheepdogState *s; + BDRVSheepdogState *base; BlockDriver *drv; /* Currently, only Sheepdog backing image is supported. */ @@ -1517,25 +1696,28 @@ static int sd_create(const char *filename, QEMUOptionParameter *options) goto out; } - ret = bdrv_file_open(&bs, backing_file, NULL, 0); + bs = NULL; + ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_PROTOCOL, NULL, + &local_err); if (ret < 0) { + qerror_report_err(local_err); + error_free(local_err); goto out; } - s = bs->opaque; + base = bs->opaque; - if (!is_snapshot(&s->inode)) { + if (!is_snapshot(&base->inode)) { error_report("cannot clone from a non snapshot vdi"); - bdrv_delete(bs); + bdrv_unref(bs); ret = -EINVAL; goto out; } - - base_vid = s->inode.vdi_id; - bdrv_delete(bs); + s->inode.vdi_id = base->inode.vdi_id; + bdrv_unref(bs); } - ret = do_sd_create(s, vdi, vdi_size, base_vid, &vid, 0); + ret = do_sd_create(s, &vid, 0); if (!prealloc || ret) { goto out; } @@ -1564,7 +1746,7 @@ static void sd_close(BlockDriverState *bs) memset(&hdr, 0, sizeof(hdr)); hdr.opcode = SD_OP_RELEASE_VDI; - hdr.vdi_id = s->inode.vdi_id; + hdr.base_vdi_id = s->inode.vdi_id; wlen = strlen(s->name) + 1; hdr.data_length = wlen; hdr.flags = SD_FLAG_CMD_WRITE; @@ -1578,7 +1760,7 @@ static void sd_close(BlockDriverState *bs) error_report("%s, %s", sd_strerror(rsp->result), s->name); } - qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(s->fd, NULL, NULL, NULL); closesocket(s->fd); g_free(s->host_spec); } @@ -1630,7 +1812,6 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset) */ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb) { - int ret; BDRVSheepdogState *s = acb->common.bs->opaque; struct iovec iov; AIOReq *aio_req; @@ -1652,18 +1833,13 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb) aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), data_len, offset, 0, 0, offset); QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - ret = add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA); - if (ret) { - free_aio_req(s, aio_req); - acb->ret = -EIO; - goto out; - } + add_aio_request(s, aio_req, &iov, 1, false, AIOCB_WRITE_UDATA); acb->aio_done_func = sd_finish_aiocb; acb->aiocb_type = AIOCB_WRITE_UDATA; return; } -out: + sd_finish_aiocb(acb); } @@ -1673,7 +1849,7 @@ static bool sd_delete(BDRVSheepdogState *s) unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0; SheepdogVdiReq hdr = { .opcode = SD_OP_DEL_VDI, - .vdi_id = s->inode.vdi_id, + .base_vdi_id = s->inode.vdi_id, .data_length = wlen, .flags = SD_FLAG_CMD_WRITE, }; @@ -1720,12 +1896,11 @@ static int sd_create_branch(BDRVSheepdogState *s) /* * Even If deletion fails, we will just create extra snapshot based on - * the workding VDI which was supposed to be deleted. So no need to + * the working VDI which was supposed to be deleted. So no need to * false bail out. */ deleted = sd_delete(s); - ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &vid, - !deleted); + ret = do_sd_create(s, &vid, !deleted); if (ret) { goto out; } @@ -1849,35 +2024,16 @@ static int coroutine_fn sd_co_rw_vector(void *p) } aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, old_oid, done); + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); if (create) { - AIOReq *areq; - QLIST_FOREACH(areq, &s->inflight_aio_head, aio_siblings) { - if (areq->oid == oid) { - /* - * Sheepdog cannot handle simultaneous create - * requests to the same object. So we cannot send - * the request until the previous request - * finishes. - */ - aio_req->flags = 0; - aio_req->base_oid = 0; - QLIST_INSERT_HEAD(&s->pending_aio_head, aio_req, - aio_siblings); - goto done; - } + if (check_simultaneous_create(s, aio_req)) { + goto done; } } - QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - ret = add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, - create, acb->aiocb_type); - if (ret < 0) { - error_report("add_aio_request is failed"); - free_aio_req(s, aio_req); - acb->ret = -EIO; - goto out; - } + add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, create, + acb->aiocb_type); done: offset = 0; idx++; @@ -1895,13 +2051,14 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, { SheepdogAIOCB *acb; int ret; + int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE; + BDRVSheepdogState *s = bs->opaque; - if (bs->growable && sector_num + nb_sectors > bs->total_sectors) { - ret = sd_truncate(bs, (sector_num + nb_sectors) * BDRV_SECTOR_SIZE); + if (bs->growable && offset > s->inode.vdi_size) { + ret = sd_truncate(bs, offset); if (ret < 0) { return ret; } - bs->total_sectors = sector_num + nb_sectors; } acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors); @@ -1945,7 +2102,6 @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs) BDRVSheepdogState *s = bs->opaque; SheepdogAIOCB *acb; AIOReq *aio_req; - int ret; if (s->cache_flags != SD_FLAG_CMD_CACHE) { return 0; @@ -1958,13 +2114,7 @@ static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs) aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), 0, 0, 0, 0, 0); QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - ret = add_aio_request(s, aio_req, NULL, 0, false, acb->aiocb_type); - if (ret < 0) { - error_report("add_aio_request is failed"); - free_aio_req(s, aio_req); - qemu_aio_release(acb); - return ret; - } + add_aio_request(s, aio_req, NULL, 0, false, acb->aiocb_type); qemu_coroutine_yield(); return acb->ret; @@ -2014,8 +2164,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) goto cleanup; } - ret = do_sd_create(s, s->name, s->inode.vdi_size, s->inode.vdi_id, &new_vid, - 1); + ret = do_sd_create(s, &new_vid, 1); if (ret < 0) { error_report("failed to create inode for snapshot. %s", strerror(errno)); @@ -2045,7 +2194,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) * We implement rollback(loadvm) operation to the specified snapshot by * 1) switch to the snapshot * 2) rely on sd_create_branch to delete working VDI and - * 3) create a new working VDI based on the speicified snapshot + * 3) create a new working VDI based on the specified snapshot */ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) { @@ -2089,7 +2238,10 @@ static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) return ret; } -static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +static int sd_snapshot_delete(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp) { /* FIXME: Delete specified snapshot id. */ return 0; @@ -2164,8 +2316,8 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) sn_tab[found].vm_state_size = inode.vm_state_size; sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec; - snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), "%u", - inode.snap_id); + snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), + "%" PRIu32, inode.snap_id); pstrcpy(sn_tab[found].name, MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)), inode.tag); @@ -2287,17 +2439,18 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, return acb->ret; } -static coroutine_fn int -sd_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - int *pnum) +static coroutine_fn int64_t +sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, + int *pnum) { BDRVSheepdogState *s = bs->opaque; SheepdogInode *inode = &s->inode; - unsigned long start = sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE, + uint64_t offset = sector_num * BDRV_SECTOR_SIZE; + unsigned long start = offset / SD_DATA_OBJ_SIZE, end = DIV_ROUND_UP((sector_num + nb_sectors) * BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE); unsigned long idx; - int ret = 1; + int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; for (idx = start; idx < end; idx++) { if (inode->data_vdi_id[idx] == 0) { @@ -2321,6 +2474,22 @@ sd_co_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, return ret; } +static int64_t sd_get_allocated_file_size(BlockDriverState *bs) +{ + BDRVSheepdogState *s = bs->opaque; + SheepdogInode *inode = &s->inode; + unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE); + uint64_t size = 0; + + for (i = 0; i < last; i++) { + if (inode->data_vdi_id[i] == 0) { + continue; + } + size += SD_DATA_OBJ_SIZE; + } + return size; +} + static QEMUOptionParameter sd_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -2337,6 +2506,11 @@ static QEMUOptionParameter sd_create_options[] = { .type = OPT_STRING, .help = "Preallocation mode (allowed values: off, full)" }, + { + .name = BLOCK_OPT_REDUNDANCY, + .type = OPT_STRING, + .help = "Redundancy of the image" + }, { NULL } }; @@ -2344,18 +2518,20 @@ static BlockDriver bdrv_sheepdog = { .format_name = "sheepdog", .protocol_name = "sheepdog", .instance_size = sizeof(BDRVSheepdogState), + .bdrv_needs_filename = true, .bdrv_file_open = sd_open, .bdrv_close = sd_close, .bdrv_create = sd_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_getlength = sd_getlength, + .bdrv_get_allocated_file_size = sd_get_allocated_file_size, .bdrv_truncate = sd_truncate, .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, .bdrv_co_discard = sd_co_discard, - .bdrv_co_is_allocated = sd_co_is_allocated, + .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, @@ -2372,18 +2548,20 @@ static BlockDriver bdrv_sheepdog_tcp = { .format_name = "sheepdog", .protocol_name = "sheepdog+tcp", .instance_size = sizeof(BDRVSheepdogState), + .bdrv_needs_filename = true, .bdrv_file_open = sd_open, .bdrv_close = sd_close, .bdrv_create = sd_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_getlength = sd_getlength, + .bdrv_get_allocated_file_size = sd_get_allocated_file_size, .bdrv_truncate = sd_truncate, .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, .bdrv_co_discard = sd_co_discard, - .bdrv_co_is_allocated = sd_co_is_allocated, + .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, @@ -2400,18 +2578,20 @@ static BlockDriver bdrv_sheepdog_unix = { .format_name = "sheepdog", .protocol_name = "sheepdog+unix", .instance_size = sizeof(BDRVSheepdogState), + .bdrv_needs_filename = true, .bdrv_file_open = sd_open, .bdrv_close = sd_close, .bdrv_create = sd_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_getlength = sd_getlength, + .bdrv_get_allocated_file_size = sd_get_allocated_file_size, .bdrv_truncate = sd_truncate, .bdrv_co_readv = sd_co_readv, .bdrv_co_writev = sd_co_writev, .bdrv_co_flush_to_disk = sd_co_flush_to_disk, .bdrv_co_discard = sd_co_discard, - .bdrv_co_is_allocated = sd_co_is_allocated, + .bdrv_co_get_block_status = sd_co_get_block_status, .bdrv_snapshot_create = sd_snapshot_create, .bdrv_snapshot_goto = sd_snapshot_goto, diff --git a/block/snapshot.c b/block/snapshot.c index 6c6d9deea1..85c52ff455 100644 --- a/block/snapshot.c +++ b/block/snapshot.c @@ -25,6 +25,24 @@ #include "block/snapshot.h" #include "block/block_int.h" +QemuOptsList internal_snapshot_opts = { + .name = "snapshot", + .head = QTAILQ_HEAD_INITIALIZER(internal_snapshot_opts.head), + .desc = { + { + .name = SNAPSHOT_OPT_ID, + .type = QEMU_OPT_STRING, + .help = "snapshot id" + },{ + .name = SNAPSHOT_OPT_NAME, + .type = QEMU_OPT_STRING, + .help = "snapshot name" + },{ + /* end of list */ + } + }, +}; + int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info, const char *name) { @@ -48,6 +66,79 @@ int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info, return ret; } +/** + * Look up an internal snapshot by @id and @name. + * @bs: block device to search + * @id: unique snapshot ID, or NULL + * @name: snapshot name, or NULL + * @sn_info: location to store information on the snapshot found + * @errp: location to store error, will be set only for exception + * + * This function will traverse snapshot list in @bs to search the matching + * one, @id and @name are the matching condition: + * If both @id and @name are specified, find the first one with id @id and + * name @name. + * If only @id is specified, find the first one with id @id. + * If only @name is specified, find the first one with name @name. + * if none is specified, abort(). + * + * Returns: true when a snapshot is found and @sn_info will be filled, false + * when error or not found. If all operation succeed but no matching one is + * found, @errp will NOT be set. + */ +bool bdrv_snapshot_find_by_id_and_name(BlockDriverState *bs, + const char *id, + const char *name, + QEMUSnapshotInfo *sn_info, + Error **errp) +{ + QEMUSnapshotInfo *sn_tab, *sn; + int nb_sns, i; + bool ret = false; + + assert(id || name); + + nb_sns = bdrv_snapshot_list(bs, &sn_tab); + if (nb_sns < 0) { + error_setg_errno(errp, -nb_sns, "Failed to get a snapshot list"); + return false; + } else if (nb_sns == 0) { + return false; + } + + if (id && name) { + for (i = 0; i < nb_sns; i++) { + sn = &sn_tab[i]; + if (!strcmp(sn->id_str, id) && !strcmp(sn->name, name)) { + *sn_info = *sn; + ret = true; + break; + } + } + } else if (id) { + for (i = 0; i < nb_sns; i++) { + sn = &sn_tab[i]; + if (!strcmp(sn->id_str, id)) { + *sn_info = *sn; + ret = true; + break; + } + } + } else if (name) { + for (i = 0; i < nb_sns; i++) { + sn = &sn_tab[i]; + if (!strcmp(sn->name, name)) { + *sn_info = *sn; + ret = true; + break; + } + } + } + + g_free(sn_tab); + return ret; +} + int bdrv_can_snapshot(BlockDriverState *bs) { BlockDriver *drv = bs->drv; @@ -97,9 +188,9 @@ int bdrv_snapshot_goto(BlockDriverState *bs, if (bs->file) { drv->bdrv_close(bs); ret = bdrv_snapshot_goto(bs->file, snapshot_id); - open_ret = drv->bdrv_open(bs, NULL, bs->open_flags); + open_ret = drv->bdrv_open(bs, NULL, bs->open_flags, NULL); if (open_ret < 0) { - bdrv_delete(bs->file); + bdrv_unref(bs->file); bs->drv = NULL; return open_ret; } @@ -109,21 +200,73 @@ int bdrv_snapshot_goto(BlockDriverState *bs, return -ENOTSUP; } -int bdrv_snapshot_delete(BlockDriverState *bs, const char *snapshot_id) +/** + * Delete an internal snapshot by @snapshot_id and @name. + * @bs: block device used in the operation + * @snapshot_id: unique snapshot ID, or NULL + * @name: snapshot name, or NULL + * @errp: location to store error + * + * If both @snapshot_id and @name are specified, delete the first one with + * id @snapshot_id and name @name. + * If only @snapshot_id is specified, delete the first one with id + * @snapshot_id. + * If only @name is specified, delete the first one with name @name. + * if none is specified, return -EINVAL. + * + * Returns: 0 on success, -errno on failure. If @bs is not inserted, return + * -ENOMEDIUM. If @snapshot_id and @name are both NULL, return -EINVAL. If @bs + * does not support internal snapshot deletion, return -ENOTSUP. If @bs does + * not support parameter @snapshot_id or @name, or one of them is not correctly + * specified, return -EINVAL. If @bs can't find one matching @id and @name, + * return -ENOENT. If @errp != NULL, it will always be filled with error + * message on failure. + */ +int bdrv_snapshot_delete(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp) { BlockDriver *drv = bs->drv; if (!drv) { + error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); return -ENOMEDIUM; } + if (!snapshot_id && !name) { + error_setg(errp, "snapshot_id and name are both NULL"); + return -EINVAL; + } if (drv->bdrv_snapshot_delete) { - return drv->bdrv_snapshot_delete(bs, snapshot_id); + return drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp); } if (bs->file) { - return bdrv_snapshot_delete(bs->file, snapshot_id); + return bdrv_snapshot_delete(bs->file, snapshot_id, name, errp); } + error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + drv->format_name, bdrv_get_device_name(bs), + "internal snapshot deletion"); return -ENOTSUP; } +void bdrv_snapshot_delete_by_id_or_name(BlockDriverState *bs, + const char *id_or_name, + Error **errp) +{ + int ret; + Error *local_err = NULL; + + ret = bdrv_snapshot_delete(bs, id_or_name, NULL, &local_err); + if (ret == -ENOENT || ret == -EINVAL) { + error_free(local_err); + local_err = NULL; + ret = bdrv_snapshot_delete(bs, NULL, id_or_name, &local_err); + } + + if (ret < 0) { + error_propagate(errp, local_err); + } +} + int bdrv_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_info) { @@ -140,18 +283,71 @@ int bdrv_snapshot_list(BlockDriverState *bs, return -ENOTSUP; } +/** + * Temporarily load an internal snapshot by @snapshot_id and @name. + * @bs: block device used in the operation + * @snapshot_id: unique snapshot ID, or NULL + * @name: snapshot name, or NULL + * @errp: location to store error + * + * If both @snapshot_id and @name are specified, load the first one with + * id @snapshot_id and name @name. + * If only @snapshot_id is specified, load the first one with id + * @snapshot_id. + * If only @name is specified, load the first one with name @name. + * if none is specified, return -EINVAL. + * + * Returns: 0 on success, -errno on fail. If @bs is not inserted, return + * -ENOMEDIUM. If @bs is not readonly, return -EINVAL. If @bs did not support + * internal snapshot, return -ENOTSUP. If qemu can't find a matching @id and + * @name, return -ENOENT. If @errp != NULL, it will always be filled on + * failure. + */ int bdrv_snapshot_load_tmp(BlockDriverState *bs, - const char *snapshot_name) + const char *snapshot_id, + const char *name, + Error **errp) { BlockDriver *drv = bs->drv; + if (!drv) { + error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); return -ENOMEDIUM; } + if (!snapshot_id && !name) { + error_setg(errp, "snapshot_id and name are both NULL"); + return -EINVAL; + } if (!bs->read_only) { + error_setg(errp, "Device is not readonly"); return -EINVAL; } if (drv->bdrv_snapshot_load_tmp) { - return drv->bdrv_snapshot_load_tmp(bs, snapshot_name); + return drv->bdrv_snapshot_load_tmp(bs, snapshot_id, name, errp); } + error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + drv->format_name, bdrv_get_device_name(bs), + "temporarily load internal snapshot"); return -ENOTSUP; } + +int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, + const char *id_or_name, + Error **errp) +{ + int ret; + Error *local_err = NULL; + + ret = bdrv_snapshot_load_tmp(bs, id_or_name, NULL, &local_err); + if (ret == -ENOENT || ret == -EINVAL) { + error_free(local_err); + local_err = NULL; + ret = bdrv_snapshot_load_tmp(bs, NULL, id_or_name, &local_err); + } + + if (local_err) { + error_propagate(errp, local_err); + } + + return ret; +} diff --git a/block/ssh.c b/block/ssh.c index d7e7bf8dd2..aa63c9d20e 100644 --- a/block/ssh.c +++ b/block/ssh.c @@ -608,7 +608,8 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options, return ret; } -static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags) +static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags, + Error **errp) { BDRVSSHState *s = bs->opaque; int ret; @@ -650,7 +651,8 @@ static QEMUOptionParameter ssh_create_options[] = { { NULL } }; -static int ssh_create(const char *filename, QEMUOptionParameter *options) +static int ssh_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int r, ret; Error *local_err = NULL; @@ -740,14 +742,6 @@ static void restart_coroutine(void *opaque) qemu_coroutine_enter(co, NULL); } -/* Always true because when we have called set_fd_handler there is - * always a request being processed. - */ -static int return_true(void *opaque) -{ - return 1; -} - static coroutine_fn void set_fd_handler(BDRVSSHState *s) { int r; @@ -766,13 +760,13 @@ static coroutine_fn void set_fd_handler(BDRVSSHState *s) DPRINTF("s->sock=%d rd_handler=%p wr_handler=%p", s->sock, rd_handler, wr_handler); - qemu_aio_set_fd_handler(s->sock, rd_handler, wr_handler, return_true, co); + qemu_aio_set_fd_handler(s->sock, rd_handler, wr_handler, co); } static coroutine_fn void clear_fd_handler(BDRVSSHState *s) { DPRINTF("s->sock=%d", s->sock); - qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL); + qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL); } /* A non-blocking call returned EAGAIN, so yield, ensuring the diff --git a/block/stream.c b/block/stream.c index 7fe9e486bf..dd0b4ac3d2 100644 --- a/block/stream.c +++ b/block/stream.c @@ -57,6 +57,11 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base, BlockDriverState *intermediate; intermediate = top->backing_hd; + /* Must assign before bdrv_delete() to prevent traversing dangling pointer + * while we delete backing image instances. + */ + top->backing_hd = base; + while (intermediate) { BlockDriverState *unused; @@ -68,9 +73,10 @@ static void close_unused_images(BlockDriverState *top, BlockDriverState *base, unused = intermediate; intermediate = intermediate->backing_hd; unused->backing_hd = NULL; - bdrv_delete(unused); + bdrv_unref(unused); } - top->backing_hd = base; + + bdrv_refresh_limits(top); } static void coroutine_fn stream_run(void *opaque) @@ -84,6 +90,11 @@ static void coroutine_fn stream_run(void *opaque) int n = 0; void *buf; + if (!bs->backing_hd) { + block_job_completed(&s->common, 0); + return; + } + s->common.len = bdrv_getlength(bs); if (s->common.len < 0) { block_job_completed(&s->common, s->common.len); @@ -110,21 +121,22 @@ static void coroutine_fn stream_run(void *opaque) /* Note that even when no rate limit is applied we need to yield * with no pending I/O here so that bdrv_drain_all() returns. */ - block_job_sleep_ns(&s->common, rt_clock, delay_ns); + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); if (block_job_is_cancelled(&s->common)) { break; } - ret = bdrv_co_is_allocated(bs, sector_num, - STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n); + copy = false; + + ret = bdrv_is_allocated(bs, sector_num, + STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n); if (ret == 1) { /* Allocated in the top, no need to copy. */ - copy = false; - } else { + } else if (ret >= 0) { /* Copy if allocated in the intermediate images. Limit to the * known-unallocated area [sector_num, sector_num+n). */ - ret = bdrv_co_is_allocated_above(bs->backing_hd, base, - sector_num, n, &n); + ret = bdrv_is_allocated_above(bs->backing_hd, base, + sector_num, n, &n); /* Finish early if end of backing file has been reached */ if (ret == 0 && n == 0) { @@ -134,7 +146,7 @@ static void coroutine_fn stream_run(void *opaque) copy = (ret == 1); } trace_stream_one_iteration(s, sector_num, n, ret); - if (ret >= 0 && copy) { + if (copy) { if (s->common.speed) { delay_ns = ratelimit_calculate_delay(&s->limit, n); if (delay_ns > 0) { @@ -198,9 +210,9 @@ static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp) ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); } -static const BlockJobType stream_job_type = { +static const BlockJobDriver stream_job_driver = { .instance_size = sizeof(StreamBlockJob), - .job_type = "stream", + .job_type = BLOCK_JOB_TYPE_STREAM, .set_speed = stream_set_speed, }; @@ -219,7 +231,7 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base, return; } - s = block_job_create(&stream_job_type, bs, speed, cb, opaque, errp); + s = block_job_create(&stream_job_driver, bs, speed, cb, opaque, errp); if (!s) { return; } diff --git a/block/vdi.c b/block/vdi.c index 8a915257e8..27737af555 100644 --- a/block/vdi.c +++ b/block/vdi.c @@ -31,7 +31,7 @@ * Allocation of blocks could be optimized (less writes to block map and * header). * - * Read and write of adjacents blocks could be done in one operation + * Read and write of adjacent blocks could be done in one operation * (current code uses one operation per block (1 MiB). * * The code is not thread safe (missing locks for changes in header and @@ -120,6 +120,11 @@ typedef unsigned char uuid_t[16]; #define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED) +/* max blocks in image is (0xffffffff / 4) */ +#define VDI_BLOCKS_IN_IMAGE_MAX 0x3fffffff +#define VDI_DISK_SIZE_MAX ((uint64_t)VDI_BLOCKS_IN_IMAGE_MAX * \ + (uint64_t)DEFAULT_CLUSTER_SIZE) + #if !defined(CONFIG_UUID) static inline void uuid_generate(uuid_t out) { @@ -165,7 +170,7 @@ typedef struct { uuid_t uuid_link; uuid_t uuid_parent; uint64_t unused2[7]; -} VdiHeader; +} QEMU_PACKED VdiHeader; typedef struct { /* The block map entries are little endian (even in memory). */ @@ -331,6 +336,7 @@ static int vdi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) logout("\n"); bdi->cluster_size = s->block_size; bdi->vm_state_offset = 0; + bdi->unallocated_blocks_are_zero = true; return 0; } @@ -364,7 +370,8 @@ static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename) return result; } -static int vdi_open(BlockDriverState *bs, QDict *options, int flags) +static int vdi_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVVdiState *s = bs->opaque; VdiHeader header; @@ -383,6 +390,14 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags) vdi_header_print(&header); #endif + if (header.disk_size > VDI_DISK_SIZE_MAX) { + error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64 + ", max supported is 0x%" PRIx64 ")", + header.disk_size, VDI_DISK_SIZE_MAX); + ret = -ENOTSUP; + goto fail; + } + if (header.disk_size % SECTOR_SIZE != 0) { /* 'VBoxManage convertfromraw' can create images with odd disk sizes. We accept them but round the disk size to the next multiple of @@ -393,43 +408,57 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags) } if (header.signature != VDI_SIGNATURE) { - logout("bad vdi signature %08x\n", header.signature); - ret = -EMEDIUMTYPE; + error_setg(errp, "Image not in VDI format (bad signature %08" PRIx32 + ")", header.signature); + ret = -EINVAL; goto fail; } else if (header.version != VDI_VERSION_1_1) { - logout("unsupported version %u.%u\n", - header.version >> 16, header.version & 0xffff); + error_setg(errp, "unsupported VDI image (version %" PRIu32 ".%" PRIu32 + ")", header.version >> 16, header.version & 0xffff); ret = -ENOTSUP; goto fail; } else if (header.offset_bmap % SECTOR_SIZE != 0) { /* We only support block maps which start on a sector boundary. */ - logout("unsupported block map offset 0x%x B\n", header.offset_bmap); + error_setg(errp, "unsupported VDI image (unaligned block map offset " + "0x%" PRIx32 ")", header.offset_bmap); ret = -ENOTSUP; goto fail; } else if (header.offset_data % SECTOR_SIZE != 0) { /* We only support data blocks which start on a sector boundary. */ - logout("unsupported data offset 0x%x B\n", header.offset_data); + error_setg(errp, "unsupported VDI image (unaligned data offset 0x%" + PRIx32 ")", header.offset_data); ret = -ENOTSUP; goto fail; } else if (header.sector_size != SECTOR_SIZE) { - logout("unsupported sector size %u B\n", header.sector_size); + error_setg(errp, "unsupported VDI image (sector size %" PRIu32 + " is not %u)", header.sector_size, SECTOR_SIZE); ret = -ENOTSUP; goto fail; - } else if (header.block_size != 1 * MiB) { - logout("unsupported block size %u B\n", header.block_size); + } else if (header.block_size != DEFAULT_CLUSTER_SIZE) { + error_setg(errp, "unsupported VDI image (block size %" PRIu32 + " is not %u)", header.block_size, DEFAULT_CLUSTER_SIZE); ret = -ENOTSUP; goto fail; } else if (header.disk_size > (uint64_t)header.blocks_in_image * header.block_size) { - logout("unsupported disk size %" PRIu64 " B\n", header.disk_size); + error_setg(errp, "unsupported VDI image (disk size %" PRIu64 ", " + "image bitmap has room for %" PRIu64 ")", + header.disk_size, + (uint64_t)header.blocks_in_image * header.block_size); ret = -ENOTSUP; goto fail; } else if (!uuid_is_null(header.uuid_link)) { - logout("link uuid != 0, unsupported\n"); + error_setg(errp, "unsupported VDI image (non-NULL link UUID)"); ret = -ENOTSUP; goto fail; } else if (!uuid_is_null(header.uuid_parent)) { - logout("parent uuid != 0, unsupported\n"); + error_setg(errp, "unsupported VDI image (non-NULL parent UUID)"); + ret = -ENOTSUP; + goto fail; + } else if (header.blocks_in_image > VDI_BLOCKS_IN_IMAGE_MAX) { + error_setg(errp, "unsupported VDI image " + "(too many blocks %u, max is %u)", + header.blocks_in_image, VDI_BLOCKS_IN_IMAGE_MAX); ret = -ENOTSUP; goto fail; } @@ -470,7 +499,7 @@ static int vdi_reopen_prepare(BDRVReopenState *state, return 0; } -static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). */ @@ -479,12 +508,23 @@ static int coroutine_fn vdi_co_is_allocated(BlockDriverState *bs, size_t sector_in_block = sector_num % s->block_sectors; int n_sectors = s->block_sectors - sector_in_block; uint32_t bmap_entry = le32_to_cpu(s->bmap[bmap_index]); + uint64_t offset; + int result; + logout("%p, %" PRId64 ", %d, %p\n", bs, sector_num, nb_sectors, pnum); if (n_sectors > nb_sectors) { n_sectors = nb_sectors; } *pnum = n_sectors; - return VDI_IS_ALLOCATED(bmap_entry); + result = VDI_IS_ALLOCATED(bmap_entry); + if (!result) { + return 0; + } + + offset = s->header.offset_data + + (uint64_t)bmap_entry * s->block_size + + sector_in_block * SECTOR_SIZE; + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; } static int vdi_co_read(BlockDriverState *bs, @@ -633,7 +673,8 @@ static int vdi_co_write(BlockDriverState *bs, return ret; } -static int vdi_create(const char *filename, QEMUOptionParameter *options) +static int vdi_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { int fd; int result = 0; @@ -668,11 +709,20 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options) options++; } + if (bytes > VDI_DISK_SIZE_MAX) { + result = -ENOTSUP; + error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64 + ", max supported is 0x%" PRIx64 ")", + bytes, VDI_DISK_SIZE_MAX); + goto exit; + } + fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 0644); if (fd < 0) { - return -errno; + result = -errno; + goto exit; } /* We need enough blocks to store the given disk size, @@ -706,6 +756,7 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options) vdi_header_to_le(&header); if (write(fd, &header, sizeof(header)) < 0) { result = -errno; + goto close_and_exit; } if (bmap_size > 0) { @@ -719,6 +770,8 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options) } if (write(fd, bmap, bmap_size) < 0) { result = -errno; + g_free(bmap); + goto close_and_exit; } g_free(bmap); } @@ -726,13 +779,16 @@ static int vdi_create(const char *filename, QEMUOptionParameter *options) if (image_type == VDI_TYPE_STATIC) { if (ftruncate(fd, sizeof(header) + bmap_size + blocks * block_size)) { result = -errno; + goto close_and_exit; } } - if (close(fd) < 0) { +close_and_exit: + if ((close(fd) < 0) && !result) { result = -errno; } +exit: return result; } @@ -780,7 +836,7 @@ static BlockDriver bdrv_vdi = { .bdrv_reopen_prepare = vdi_reopen_prepare, .bdrv_create = vdi_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_is_allocated = vdi_co_is_allocated, + .bdrv_co_get_block_status = vdi_co_get_block_status, .bdrv_make_empty = vdi_make_empty, .bdrv_read = vdi_co_read, diff --git a/block/vhdx-endian.c b/block/vhdx-endian.c new file mode 100644 index 0000000000..fe879ed995 --- /dev/null +++ b/block/vhdx-endian.c @@ -0,0 +1,216 @@ +/* + * Block driver for Hyper-V VHDX Images + * + * Copyright (c) 2013 Red Hat, Inc., + * + * Authors: + * Jeff Cody + * + * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 + * by Microsoft: + * https://www.microsoft.com/en-us/download/details.aspx?id=34750 + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/vhdx.h" + +#include + + +/* + * All the VHDX formats on disk are little endian - the following + * are helper import/export functions to correctly convert + * endianness from disk read to native cpu format, and back again. + */ + + +/* VHDX File Header */ + + +void vhdx_header_le_import(VHDXHeader *h) +{ + assert(h != NULL); + + le32_to_cpus(&h->signature); + le32_to_cpus(&h->checksum); + le64_to_cpus(&h->sequence_number); + + leguid_to_cpus(&h->file_write_guid); + leguid_to_cpus(&h->data_write_guid); + leguid_to_cpus(&h->log_guid); + + le16_to_cpus(&h->log_version); + le16_to_cpus(&h->version); + le32_to_cpus(&h->log_length); + le64_to_cpus(&h->log_offset); +} + +void vhdx_header_le_export(VHDXHeader *orig_h, VHDXHeader *new_h) +{ + assert(orig_h != NULL); + assert(new_h != NULL); + + new_h->signature = cpu_to_le32(orig_h->signature); + new_h->checksum = cpu_to_le32(orig_h->checksum); + new_h->sequence_number = cpu_to_le64(orig_h->sequence_number); + + new_h->file_write_guid = orig_h->file_write_guid; + new_h->data_write_guid = orig_h->data_write_guid; + new_h->log_guid = orig_h->log_guid; + + cpu_to_leguids(&new_h->file_write_guid); + cpu_to_leguids(&new_h->data_write_guid); + cpu_to_leguids(&new_h->log_guid); + + new_h->log_version = cpu_to_le16(orig_h->log_version); + new_h->version = cpu_to_le16(orig_h->version); + new_h->log_length = cpu_to_le32(orig_h->log_length); + new_h->log_offset = cpu_to_le64(orig_h->log_offset); +} + + +/* VHDX Log Headers */ + + +void vhdx_log_desc_le_import(VHDXLogDescriptor *d) +{ + assert(d != NULL); + + le32_to_cpus(&d->signature); + le32_to_cpus(&d->trailing_bytes); + le64_to_cpus(&d->leading_bytes); + le64_to_cpus(&d->file_offset); + le64_to_cpus(&d->sequence_number); +} + +void vhdx_log_desc_le_export(VHDXLogDescriptor *d) +{ + assert(d != NULL); + + cpu_to_le32s(&d->signature); + cpu_to_le32s(&d->trailing_bytes); + cpu_to_le64s(&d->leading_bytes); + cpu_to_le64s(&d->file_offset); + cpu_to_le64s(&d->sequence_number); +} + +void vhdx_log_data_le_export(VHDXLogDataSector *d) +{ + assert(d != NULL); + + cpu_to_le32s(&d->data_signature); + cpu_to_le32s(&d->sequence_high); + cpu_to_le32s(&d->sequence_low); +} + +void vhdx_log_entry_hdr_le_import(VHDXLogEntryHeader *hdr) +{ + assert(hdr != NULL); + + le32_to_cpus(&hdr->signature); + le32_to_cpus(&hdr->checksum); + le32_to_cpus(&hdr->entry_length); + le32_to_cpus(&hdr->tail); + le64_to_cpus(&hdr->sequence_number); + le32_to_cpus(&hdr->descriptor_count); + leguid_to_cpus(&hdr->log_guid); + le64_to_cpus(&hdr->flushed_file_offset); + le64_to_cpus(&hdr->last_file_offset); +} + +void vhdx_log_entry_hdr_le_export(VHDXLogEntryHeader *hdr) +{ + assert(hdr != NULL); + + cpu_to_le32s(&hdr->signature); + cpu_to_le32s(&hdr->checksum); + cpu_to_le32s(&hdr->entry_length); + cpu_to_le32s(&hdr->tail); + cpu_to_le64s(&hdr->sequence_number); + cpu_to_le32s(&hdr->descriptor_count); + cpu_to_leguids(&hdr->log_guid); + cpu_to_le64s(&hdr->flushed_file_offset); + cpu_to_le64s(&hdr->last_file_offset); +} + + +/* Region table entries */ +void vhdx_region_header_le_import(VHDXRegionTableHeader *hdr) +{ + assert(hdr != NULL); + + le32_to_cpus(&hdr->signature); + le32_to_cpus(&hdr->checksum); + le32_to_cpus(&hdr->entry_count); +} + +void vhdx_region_header_le_export(VHDXRegionTableHeader *hdr) +{ + assert(hdr != NULL); + + cpu_to_le32s(&hdr->signature); + cpu_to_le32s(&hdr->checksum); + cpu_to_le32s(&hdr->entry_count); +} + +void vhdx_region_entry_le_import(VHDXRegionTableEntry *e) +{ + assert(e != NULL); + + leguid_to_cpus(&e->guid); + le64_to_cpus(&e->file_offset); + le32_to_cpus(&e->length); + le32_to_cpus(&e->data_bits); +} + +void vhdx_region_entry_le_export(VHDXRegionTableEntry *e) +{ + assert(e != NULL); + + cpu_to_leguids(&e->guid); + cpu_to_le64s(&e->file_offset); + cpu_to_le32s(&e->length); + cpu_to_le32s(&e->data_bits); +} + + +/* Metadata headers & table */ +void vhdx_metadata_header_le_import(VHDXMetadataTableHeader *hdr) +{ + assert(hdr != NULL); + + le64_to_cpus(&hdr->signature); + le16_to_cpus(&hdr->entry_count); +} + +void vhdx_metadata_header_le_export(VHDXMetadataTableHeader *hdr) +{ + assert(hdr != NULL); + + cpu_to_le64s(&hdr->signature); + cpu_to_le16s(&hdr->entry_count); +} + +void vhdx_metadata_entry_le_import(VHDXMetadataTableEntry *e) +{ + assert(e != NULL); + + leguid_to_cpus(&e->item_id); + le32_to_cpus(&e->offset); + le32_to_cpus(&e->length); + le32_to_cpus(&e->data_bits); +} +void vhdx_metadata_entry_le_export(VHDXMetadataTableEntry *e) +{ + assert(e != NULL); + + cpu_to_leguids(&e->item_id); + cpu_to_le32s(&e->offset); + cpu_to_le32s(&e->length); + cpu_to_le32s(&e->data_bits); +} diff --git a/block/vhdx-log.c b/block/vhdx-log.c new file mode 100644 index 0000000000..a77c040ee0 --- /dev/null +++ b/block/vhdx-log.c @@ -0,0 +1,1021 @@ +/* + * Block driver for Hyper-V VHDX Images + * + * Copyright (c) 2013 Red Hat, Inc., + * + * Authors: + * Jeff Cody + * + * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 + * by Microsoft: + * https://www.microsoft.com/en-us/download/details.aspx?id=34750 + * + * This file covers the functionality of the metadata log writing, parsing, and + * replay. + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "block/vhdx.h" + + +typedef struct VHDXLogSequence { + bool valid; + uint32_t count; + VHDXLogEntries log; + VHDXLogEntryHeader hdr; +} VHDXLogSequence; + +typedef struct VHDXLogDescEntries { + VHDXLogEntryHeader hdr; + VHDXLogDescriptor desc[]; +} VHDXLogDescEntries; + +static const MSGUID zero_guid = { 0 }; + +/* The log located on the disk is circular buffer containing + * sectors of 4096 bytes each. + * + * It is assumed for the read/write functions below that the + * circular buffer scheme uses a 'one sector open' to indicate + * the buffer is full. Given the validation methods used for each + * sector, this method should be compatible with other methods that + * do not waste a sector. + */ + + +/* Allow peeking at the hdr entry at the beginning of the current + * read index, without advancing the read index */ +static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log, + VHDXLogEntryHeader *hdr) +{ + int ret = 0; + uint64_t offset; + uint32_t read; + + assert(hdr != NULL); + + /* peek is only supported on sector boundaries */ + if (log->read % VHDX_LOG_SECTOR_SIZE) { + ret = -EFAULT; + goto exit; + } + + read = log->read; + /* we are guaranteed that a) log sectors are 4096 bytes, + * and b) the log length is a multiple of 1MB. So, there + * is always a round number of sectors in the buffer */ + if ((read + sizeof(VHDXLogEntryHeader)) > log->length) { + read = 0; + } + + if (read == log->write) { + ret = -EINVAL; + goto exit; + } + + offset = log->offset + read; + + ret = bdrv_pread(bs->file, offset, hdr, sizeof(VHDXLogEntryHeader)); + if (ret < 0) { + goto exit; + } + +exit: + return ret; +} + +/* Index increment for log, based on sector boundaries */ +static int vhdx_log_inc_idx(uint32_t idx, uint64_t length) +{ + idx += VHDX_LOG_SECTOR_SIZE; + /* we are guaranteed that a) log sectors are 4096 bytes, + * and b) the log length is a multiple of 1MB. So, there + * is always a round number of sectors in the buffer */ + return idx >= length ? 0 : idx; +} + + +/* Reset the log to empty */ +static void vhdx_log_reset(BlockDriverState *bs, BDRVVHDXState *s) +{ + MSGUID guid = { 0 }; + s->log.read = s->log.write = 0; + /* a log guid of 0 indicates an empty log to any parser of v0 + * VHDX logs */ + vhdx_update_headers(bs, s, false, &guid); +} + +/* Reads num_sectors from the log (all log sectors are 4096 bytes), + * into buffer 'buffer'. Upon return, *sectors_read will contain + * the number of sectors successfully read. + * + * It is assumed that 'buffer' is already allocated, and of sufficient + * size (i.e. >= 4096*num_sectors). + * + * If 'peek' is true, then the tail (read) pointer for the circular buffer is + * not modified. + * + * 0 is returned on success, -errno otherwise. */ +static int vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log, + uint32_t *sectors_read, void *buffer, + uint32_t num_sectors, bool peek) +{ + int ret = 0; + uint64_t offset; + uint32_t read; + + read = log->read; + + *sectors_read = 0; + while (num_sectors) { + if (read == log->write) { + /* empty */ + break; + } + offset = log->offset + read; + + ret = bdrv_pread(bs->file, offset, buffer, VHDX_LOG_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + read = vhdx_log_inc_idx(read, log->length); + + *sectors_read = *sectors_read + 1; + num_sectors--; + } + +exit: + if (!peek) { + log->read = read; + } + return ret; +} + +/* Writes num_sectors to the log (all log sectors are 4096 bytes), + * from buffer 'buffer'. Upon return, *sectors_written will contain + * the number of sectors successfully written. + * + * It is assumed that 'buffer' is at least 4096*num_sectors large. + * + * 0 is returned on success, -errno otherwise */ +static int vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log, + uint32_t *sectors_written, void *buffer, + uint32_t num_sectors) +{ + int ret = 0; + uint64_t offset; + uint32_t write; + void *buffer_tmp; + BDRVVHDXState *s = bs->opaque; + + ret = vhdx_user_visible_write(bs, s); + if (ret < 0) { + goto exit; + } + + write = log->write; + + buffer_tmp = buffer; + while (num_sectors) { + + offset = log->offset + write; + write = vhdx_log_inc_idx(write, log->length); + if (write == log->read) { + /* full */ + break; + } + ret = bdrv_pwrite(bs->file, offset, buffer_tmp, VHDX_LOG_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + buffer_tmp += VHDX_LOG_SECTOR_SIZE; + + log->write = write; + *sectors_written = *sectors_written + 1; + num_sectors--; + } + +exit: + return ret; +} + + +/* Validates a log entry header */ +static bool vhdx_log_hdr_is_valid(VHDXLogEntries *log, VHDXLogEntryHeader *hdr, + BDRVVHDXState *s) +{ + int valid = false; + + if (memcmp(&hdr->signature, "loge", 4)) { + goto exit; + } + + /* if the individual entry length is larger than the whole log + * buffer, that is obviously invalid */ + if (log->length < hdr->entry_length) { + goto exit; + } + + /* length of entire entry must be in units of 4KB (log sector size) */ + if (hdr->entry_length % (VHDX_LOG_SECTOR_SIZE)) { + goto exit; + } + + /* per spec, sequence # must be > 0 */ + if (hdr->sequence_number == 0) { + goto exit; + } + + /* log entries are only valid if they match the file-wide log guid + * found in the active header */ + if (!guid_eq(hdr->log_guid, s->headers[s->curr_header]->log_guid)) { + goto exit; + } + + if (hdr->descriptor_count * sizeof(VHDXLogDescriptor) > hdr->entry_length) { + goto exit; + } + + valid = true; + +exit: + return valid; +} + +/* + * Given a log header, this will validate that the descriptors and the + * corresponding data sectors (if applicable) + * + * Validation consists of: + * 1. Making sure the sequence numbers matches the entry header + * 2. Verifying a valid signature ('zero' or 'desc' for descriptors) + * 3. File offset field is a multiple of 4KB + * 4. If a data descriptor, the corresponding data sector + * has its signature ('data') and matching sequence number + * + * @desc: the data buffer containing the descriptor + * @hdr: the log entry header + * + * Returns true if valid + */ +static bool vhdx_log_desc_is_valid(VHDXLogDescriptor *desc, + VHDXLogEntryHeader *hdr) +{ + bool ret = false; + + if (desc->sequence_number != hdr->sequence_number) { + goto exit; + } + if (desc->file_offset % VHDX_LOG_SECTOR_SIZE) { + goto exit; + } + + if (!memcmp(&desc->signature, "zero", 4)) { + if (desc->zero_length % VHDX_LOG_SECTOR_SIZE == 0) { + /* valid */ + ret = true; + } + } else if (!memcmp(&desc->signature, "desc", 4)) { + /* valid */ + ret = true; + } + +exit: + return ret; +} + + +/* Prior to sector data for a log entry, there is the header + * and the descriptors referenced in the header: + * + * [] = 4KB sector + * + * [ hdr, desc ][ desc ][ ... ][ data ][ ... ] + * + * The first sector in a log entry has a 64 byte header, and + * up to 126 32-byte descriptors. If more descriptors than + * 126 are required, then subsequent sectors can have up to 128 + * descriptors. Each sector is 4KB. Data follows the descriptor + * sectors. + * + * This will return the number of sectors needed to encompass + * the passed number of descriptors in desc_cnt. + * + * This will never return 0, even if desc_cnt is 0. + */ +static int vhdx_compute_desc_sectors(uint32_t desc_cnt) +{ + uint32_t desc_sectors; + + desc_cnt += 2; /* account for header in first sector */ + desc_sectors = desc_cnt / 128; + if (desc_cnt % 128) { + desc_sectors++; + } + + return desc_sectors; +} + + +/* Reads the log header, and subsequent descriptors (if any). This + * will allocate all the space for buffer, which must be NULL when + * passed into this function. Each descriptor will also be validated, + * and error returned if any are invalid. */ +static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s, + VHDXLogEntries *log, VHDXLogDescEntries **buffer) +{ + int ret = 0; + uint32_t desc_sectors; + uint32_t sectors_read; + VHDXLogEntryHeader hdr; + VHDXLogDescEntries *desc_entries = NULL; + int i; + + assert(*buffer == NULL); + + ret = vhdx_log_peek_hdr(bs, log, &hdr); + if (ret < 0) { + goto exit; + } + vhdx_log_entry_hdr_le_import(&hdr); + if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) { + ret = -EINVAL; + goto exit; + } + + desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count); + desc_entries = qemu_blockalign(bs, desc_sectors * VHDX_LOG_SECTOR_SIZE); + + ret = vhdx_log_read_sectors(bs, log, §ors_read, desc_entries, + desc_sectors, false); + if (ret < 0) { + goto free_and_exit; + } + if (sectors_read != desc_sectors) { + ret = -EINVAL; + goto free_and_exit; + } + + /* put in proper endianness, and validate each desc */ + for (i = 0; i < hdr.descriptor_count; i++) { + vhdx_log_desc_le_import(&desc_entries->desc[i]); + if (vhdx_log_desc_is_valid(&desc_entries->desc[i], &hdr) == false) { + ret = -EINVAL; + goto free_and_exit; + } + } + + *buffer = desc_entries; + goto exit; + +free_and_exit: + qemu_vfree(desc_entries); +exit: + return ret; +} + + +/* Flushes the descriptor described by desc to the VHDX image file. + * If the descriptor is a data descriptor, than 'data' must be non-NULL, + * and >= 4096 bytes (VHDX_LOG_SECTOR_SIZE), containing the data to be + * written. + * + * Verification is performed to make sure the sequence numbers of a data + * descriptor match the sequence number in the desc. + * + * For a zero descriptor, it may describe multiple sectors to fill with zeroes. + * In this case, it should be noted that zeroes are written to disk, and the + * image file is not extended as a sparse file. */ +static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc, + VHDXLogDataSector *data) +{ + int ret = 0; + uint64_t seq, file_offset; + uint32_t offset = 0; + void *buffer = NULL; + uint64_t count = 1; + int i; + + buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + + if (!memcmp(&desc->signature, "desc", 4)) { + /* data sector */ + if (data == NULL) { + ret = -EFAULT; + goto exit; + } + + /* The sequence number of the data sector must match that + * in the descriptor */ + seq = data->sequence_high; + seq <<= 32; + seq |= data->sequence_low & 0xffffffff; + + if (seq != desc->sequence_number) { + ret = -EINVAL; + goto exit; + } + + /* Each data sector is in total 4096 bytes, however the first + * 8 bytes, and last 4 bytes, are located in the descriptor */ + memcpy(buffer, &desc->leading_bytes, 8); + offset += 8; + + memcpy(buffer+offset, data->data, 4084); + offset += 4084; + + memcpy(buffer+offset, &desc->trailing_bytes, 4); + + } else if (!memcmp(&desc->signature, "zero", 4)) { + /* write 'count' sectors of sector */ + memset(buffer, 0, VHDX_LOG_SECTOR_SIZE); + count = desc->zero_length / VHDX_LOG_SECTOR_SIZE; + } + + file_offset = desc->file_offset; + + /* count is only > 1 if we are writing zeroes */ + for (i = 0; i < count; i++) { + ret = bdrv_pwrite_sync(bs->file, file_offset, buffer, + VHDX_LOG_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + file_offset += VHDX_LOG_SECTOR_SIZE; + } + +exit: + qemu_vfree(buffer); + return ret; +} + +/* Flush the entire log (as described by 'logs') to the VHDX image + * file, and then set the log to 'empty' status once complete. + * + * The log entries should be validate prior to flushing */ +static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s, + VHDXLogSequence *logs) +{ + int ret = 0; + int i; + uint32_t cnt, sectors_read; + uint64_t new_file_size; + void *data = NULL; + VHDXLogDescEntries *desc_entries = NULL; + VHDXLogEntryHeader hdr_tmp = { 0 }; + + cnt = logs->count; + + data = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + + ret = vhdx_user_visible_write(bs, s); + if (ret < 0) { + goto exit; + } + + /* each iteration represents one log sequence, which may span multiple + * sectors */ + while (cnt--) { + ret = vhdx_log_peek_hdr(bs, &logs->log, &hdr_tmp); + if (ret < 0) { + goto exit; + } + /* if the log shows a FlushedFileOffset larger than our current file + * size, then that means the file has been truncated / corrupted, and + * we must refused to open it / use it */ + if (hdr_tmp.flushed_file_offset > bdrv_getlength(bs->file)) { + ret = -EINVAL; + goto exit; + } + + ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries); + if (ret < 0) { + goto exit; + } + + for (i = 0; i < desc_entries->hdr.descriptor_count; i++) { + if (!memcmp(&desc_entries->desc[i].signature, "desc", 4)) { + /* data sector, so read a sector to flush */ + ret = vhdx_log_read_sectors(bs, &logs->log, §ors_read, + data, 1, false); + if (ret < 0) { + goto exit; + } + if (sectors_read != 1) { + ret = -EINVAL; + goto exit; + } + } + + ret = vhdx_log_flush_desc(bs, &desc_entries->desc[i], data); + if (ret < 0) { + goto exit; + } + } + if (bdrv_getlength(bs->file) < desc_entries->hdr.last_file_offset) { + new_file_size = desc_entries->hdr.last_file_offset; + if (new_file_size % (1024*1024)) { + /* round up to nearest 1MB boundary */ + new_file_size = ((new_file_size >> 20) + 1) << 20; + bdrv_truncate(bs->file, new_file_size); + } + } + qemu_vfree(desc_entries); + desc_entries = NULL; + } + + bdrv_flush(bs); + /* once the log is fully flushed, indicate that we have an empty log + * now. This also sets the log guid to 0, to indicate an empty log */ + vhdx_log_reset(bs, s); + +exit: + qemu_vfree(data); + qemu_vfree(desc_entries); + return ret; +} + +static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s, + VHDXLogEntries *log, uint64_t seq, + bool *valid, VHDXLogEntryHeader *entry) +{ + int ret = 0; + VHDXLogEntryHeader hdr; + void *buffer = NULL; + uint32_t i, desc_sectors, total_sectors, crc; + uint32_t sectors_read = 0; + VHDXLogDescEntries *desc_buffer = NULL; + + *valid = false; + + ret = vhdx_log_peek_hdr(bs, log, &hdr); + if (ret < 0) { + goto inc_and_exit; + } + + vhdx_log_entry_hdr_le_import(&hdr); + + + if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) { + goto inc_and_exit; + } + + if (seq > 0) { + if (hdr.sequence_number != seq + 1) { + goto inc_and_exit; + } + } + + desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count); + + /* Read desc sectors, and calculate log checksum */ + + total_sectors = hdr.entry_length / VHDX_LOG_SECTOR_SIZE; + + + /* read_desc() will increment the read idx */ + ret = vhdx_log_read_desc(bs, s, log, &desc_buffer); + if (ret < 0) { + goto free_and_exit; + } + + crc = vhdx_checksum_calc(0xffffffff, (void *)desc_buffer, + desc_sectors * VHDX_LOG_SECTOR_SIZE, 4); + crc ^= 0xffffffff; + + buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + if (total_sectors > desc_sectors) { + for (i = 0; i < total_sectors - desc_sectors; i++) { + sectors_read = 0; + ret = vhdx_log_read_sectors(bs, log, §ors_read, buffer, + 1, false); + if (ret < 0 || sectors_read != 1) { + goto free_and_exit; + } + crc = vhdx_checksum_calc(crc, buffer, VHDX_LOG_SECTOR_SIZE, -1); + crc ^= 0xffffffff; + } + } + crc ^= 0xffffffff; + if (crc != desc_buffer->hdr.checksum) { + goto free_and_exit; + } + + *valid = true; + *entry = hdr; + goto free_and_exit; + +inc_and_exit: + log->read = vhdx_log_inc_idx(log->read, log->length); + +free_and_exit: + qemu_vfree(buffer); + qemu_vfree(desc_buffer); + return ret; +} + +/* Search through the log circular buffer, and find the valid, active + * log sequence, if any exists + * */ +static int vhdx_log_search(BlockDriverState *bs, BDRVVHDXState *s, + VHDXLogSequence *logs) +{ + int ret = 0; + uint32_t tail; + bool seq_valid = false; + VHDXLogSequence candidate = { 0 }; + VHDXLogEntryHeader hdr = { 0 }; + VHDXLogEntries curr_log; + + memcpy(&curr_log, &s->log, sizeof(VHDXLogEntries)); + curr_log.write = curr_log.length; /* assume log is full */ + curr_log.read = 0; + + + /* now we will go through the whole log sector by sector, until + * we find a valid, active log sequence, or reach the end of the + * log buffer */ + for (;;) { + uint64_t curr_seq = 0; + VHDXLogSequence current = { 0 }; + + tail = curr_log.read; + + ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq, + &seq_valid, &hdr); + if (ret < 0) { + goto exit; + } + + if (seq_valid) { + current.valid = true; + current.log = curr_log; + current.log.read = tail; + current.log.write = curr_log.read; + current.count = 1; + current.hdr = hdr; + + + for (;;) { + ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq, + &seq_valid, &hdr); + if (ret < 0) { + goto exit; + } + if (seq_valid == false) { + break; + } + current.log.write = curr_log.read; + current.count++; + + curr_seq = hdr.sequence_number; + } + } + + if (current.valid) { + if (candidate.valid == false || + current.hdr.sequence_number > candidate.hdr.sequence_number) { + candidate = current; + } + } + + if (curr_log.read < tail) { + break; + } + } + + *logs = candidate; + + if (candidate.valid) { + /* this is the next sequence number, for writes */ + s->log.sequence = candidate.hdr.sequence_number + 1; + } + + +exit: + return ret; +} + +/* Parse the replay log. Per the VHDX spec, if the log is present + * it must be replayed prior to opening the file, even read-only. + * + * If read-only, we must replay the log in RAM (or refuse to open + * a dirty VHDX file read-only) */ +int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed, + Error **errp) +{ + int ret = 0; + VHDXHeader *hdr; + VHDXLogSequence logs = { 0 }; + + hdr = s->headers[s->curr_header]; + + *flushed = false; + + /* s->log.hdr is freed in vhdx_close() */ + if (s->log.hdr == NULL) { + s->log.hdr = qemu_blockalign(bs, sizeof(VHDXLogEntryHeader)); + } + + s->log.offset = hdr->log_offset; + s->log.length = hdr->log_length; + + if (s->log.offset < VHDX_LOG_MIN_SIZE || + s->log.offset % VHDX_LOG_MIN_SIZE) { + ret = -EINVAL; + goto exit; + } + + /* per spec, only log version of 0 is supported */ + if (hdr->log_version != 0) { + ret = -EINVAL; + goto exit; + } + + /* If either the log guid, or log length is zero, + * then a replay log is not present */ + if (guid_eq(hdr->log_guid, zero_guid)) { + goto exit; + } + + if (hdr->log_length == 0) { + goto exit; + } + + if (hdr->log_length % VHDX_LOG_MIN_SIZE) { + ret = -EINVAL; + goto exit; + } + + + /* The log is present, we need to find if and where there is an active + * sequence of valid entries present in the log. */ + + ret = vhdx_log_search(bs, s, &logs); + if (ret < 0) { + goto exit; + } + + if (logs.valid) { + if (bs->read_only) { + ret = -EPERM; + error_setg_errno(errp, EPERM, + "VHDX image file '%s' opened read-only, but " + "contains a log that needs to be replayed. To " + "replay the log, execute:\n qemu-img check -r " + "all '%s'", + bs->filename, bs->filename); + goto exit; + } + /* now flush the log */ + ret = vhdx_log_flush(bs, s, &logs); + if (ret < 0) { + goto exit; + } + *flushed = true; + } + + +exit: + return ret; +} + + + +static void vhdx_log_raw_to_le_sector(VHDXLogDescriptor *desc, + VHDXLogDataSector *sector, void *data, + uint64_t seq) +{ + /* 8 + 4084 + 4 = 4096, 1 log sector */ + memcpy(&desc->leading_bytes, data, 8); + data += 8; + cpu_to_le64s(&desc->leading_bytes); + memcpy(sector->data, data, 4084); + data += 4084; + memcpy(&desc->trailing_bytes, data, 4); + cpu_to_le32s(&desc->trailing_bytes); + data += 4; + + sector->sequence_high = (uint32_t) (seq >> 32); + sector->sequence_low = (uint32_t) (seq & 0xffffffff); + sector->data_signature = VHDX_LOG_DATA_SIGNATURE; + + vhdx_log_desc_le_export(desc); + vhdx_log_data_le_export(sector); +} + + +static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, + void *data, uint32_t length, uint64_t offset) +{ + int ret = 0; + void *buffer = NULL; + void *merged_sector = NULL; + void *data_tmp, *sector_write; + unsigned int i; + int sector_offset; + uint32_t desc_sectors, sectors, total_length; + uint32_t sectors_written = 0; + uint32_t aligned_length; + uint32_t leading_length = 0; + uint32_t trailing_length = 0; + uint32_t partial_sectors = 0; + uint32_t bytes_written = 0; + uint64_t file_offset; + VHDXHeader *header; + VHDXLogEntryHeader new_hdr; + VHDXLogDescriptor *new_desc = NULL; + VHDXLogDataSector *data_sector = NULL; + MSGUID new_guid = { 0 }; + + header = s->headers[s->curr_header]; + + /* need to have offset read data, and be on 4096 byte boundary */ + + if (length > header->log_length) { + /* no log present. we could create a log here instead of failing */ + ret = -EINVAL; + goto exit; + } + + if (guid_eq(header->log_guid, zero_guid)) { + vhdx_guid_generate(&new_guid); + vhdx_update_headers(bs, s, false, &new_guid); + } else { + /* currently, we require that the log be flushed after + * every write. */ + ret = -ENOTSUP; + goto exit; + } + + /* 0 is an invalid sequence number, but may also represent the first + * log write (or a wrapped seq) */ + if (s->log.sequence == 0) { + s->log.sequence = 1; + } + + sector_offset = offset % VHDX_LOG_SECTOR_SIZE; + file_offset = (offset / VHDX_LOG_SECTOR_SIZE) * VHDX_LOG_SECTOR_SIZE; + + aligned_length = length; + + /* add in the unaligned head and tail bytes */ + if (sector_offset) { + leading_length = (VHDX_LOG_SECTOR_SIZE - sector_offset); + leading_length = leading_length > length ? length : leading_length; + aligned_length -= leading_length; + partial_sectors++; + } + + sectors = aligned_length / VHDX_LOG_SECTOR_SIZE; + trailing_length = aligned_length - (sectors * VHDX_LOG_SECTOR_SIZE); + if (trailing_length) { + partial_sectors++; + } + + sectors += partial_sectors; + + /* sectors is now how many sectors the data itself takes, not + * including the header and descriptor metadata */ + + new_hdr = (VHDXLogEntryHeader) { + .signature = VHDX_LOG_SIGNATURE, + .tail = s->log.tail, + .sequence_number = s->log.sequence, + .descriptor_count = sectors, + .reserved = 0, + .flushed_file_offset = bdrv_getlength(bs->file), + .last_file_offset = bdrv_getlength(bs->file), + }; + + new_hdr.log_guid = header->log_guid; + + desc_sectors = vhdx_compute_desc_sectors(new_hdr.descriptor_count); + + total_length = (desc_sectors + sectors) * VHDX_LOG_SECTOR_SIZE; + new_hdr.entry_length = total_length; + + vhdx_log_entry_hdr_le_export(&new_hdr); + + buffer = qemu_blockalign(bs, total_length); + memcpy(buffer, &new_hdr, sizeof(new_hdr)); + + new_desc = (VHDXLogDescriptor *) (buffer + sizeof(new_hdr)); + data_sector = buffer + (desc_sectors * VHDX_LOG_SECTOR_SIZE); + data_tmp = data; + + /* All log sectors are 4KB, so for any partial sectors we must + * merge the data with preexisting data from the final file + * destination */ + merged_sector = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + + for (i = 0; i < sectors; i++) { + new_desc->signature = VHDX_LOG_DESC_SIGNATURE; + new_desc->sequence_number = s->log.sequence; + new_desc->file_offset = file_offset; + + if (i == 0 && leading_length) { + /* partial sector at the front of the buffer */ + ret = bdrv_pread(bs->file, file_offset, merged_sector, + VHDX_LOG_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + memcpy(merged_sector + sector_offset, data_tmp, leading_length); + bytes_written = leading_length; + sector_write = merged_sector; + } else if (i == sectors - 1 && trailing_length) { + /* partial sector at the end of the buffer */ + ret = bdrv_pread(bs->file, + file_offset, + merged_sector + trailing_length, + VHDX_LOG_SECTOR_SIZE - trailing_length); + if (ret < 0) { + goto exit; + } + memcpy(merged_sector, data_tmp, trailing_length); + bytes_written = trailing_length; + sector_write = merged_sector; + } else { + bytes_written = VHDX_LOG_SECTOR_SIZE; + sector_write = data_tmp; + } + + /* populate the raw sector data into the proper structures, + * as well as update the descriptor, and convert to proper + * endianness */ + vhdx_log_raw_to_le_sector(new_desc, data_sector, sector_write, + s->log.sequence); + + data_tmp += bytes_written; + data_sector++; + new_desc++; + file_offset += VHDX_LOG_SECTOR_SIZE; + } + + /* checksum covers entire entry, from the log header through the + * last data sector */ + vhdx_update_checksum(buffer, total_length, + offsetof(VHDXLogEntryHeader, checksum)); + cpu_to_le32s((uint32_t *)(buffer + 4)); + + /* now write to the log */ + ret = vhdx_log_write_sectors(bs, &s->log, §ors_written, buffer, + desc_sectors + sectors); + if (ret < 0) { + goto exit; + } + + if (sectors_written != desc_sectors + sectors) { + /* instead of failing, we could flush the log here */ + ret = -EINVAL; + goto exit; + } + + s->log.sequence++; + /* write new tail */ + s->log.tail = s->log.write; + +exit: + qemu_vfree(buffer); + qemu_vfree(merged_sector); + return ret; +} + +/* Perform a log write, and then immediately flush the entire log */ +int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, + void *data, uint32_t length, uint64_t offset) +{ + int ret = 0; + VHDXLogSequence logs = { .valid = true, + .count = 1, + .hdr = { 0 } }; + + + /* Make sure data written (new and/or changed blocks) is stable + * on disk, before creating log entry */ + bdrv_flush(bs); + ret = vhdx_log_write(bs, s, data, length, offset); + if (ret < 0) { + goto exit; + } + logs.log = s->log; + + /* Make sure log is stable on disk */ + bdrv_flush(bs); + ret = vhdx_log_flush(bs, s, &logs); + if (ret < 0) { + goto exit; + } + + s->log = logs.log; + +exit: + return ret; +} + diff --git a/block/vhdx.c b/block/vhdx.c index e9704b1fdc..509baaf484 100644 --- a/block/vhdx.c +++ b/block/vhdx.c @@ -6,9 +6,9 @@ * Authors: * Jeff Cody * - * This is based on the "VHDX Format Specification v0.95", published 4/12/2012 + * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 * by Microsoft: - * https://www.microsoft.com/en-us/download/details.aspx?id=29681 + * https://www.microsoft.com/en-us/download/details.aspx?id=34750 * * This work is licensed under the terms of the GNU LGPL, version 2 or later. * See the COPYING.LIB file in the top-level directory. @@ -20,7 +20,22 @@ #include "qemu/module.h" #include "qemu/crc32c.h" #include "block/vhdx.h" +#include "migration/migration.h" +#include +#include + +/* Options for VHDX creation */ + +#define VHDX_BLOCK_OPT_LOG_SIZE "log_size" +#define VHDX_BLOCK_OPT_BLOCK_SIZE "block_size" +#define VHDX_BLOCK_OPT_ZERO "block_state_zero" + +typedef enum VHDXImageType { + VHDX_TYPE_DYNAMIC = 0, + VHDX_TYPE_FIXED, + VHDX_TYPE_DIFFERENCING, /* Currently unsupported */ +} VHDXImageType; /* Several metadata and region table data entries are identified by * guids in a MS-specific GUID format. */ @@ -103,16 +118,6 @@ static const MSGUID parent_vhdx_guid = { .data1 = 0xb04aefb7, META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \ META_PHYS_SECTOR_SIZE_PRESENT) -typedef struct VHDXMetadataEntries { - VHDXMetadataTableEntry file_parameters_entry; - VHDXMetadataTableEntry virtual_disk_size_entry; - VHDXMetadataTableEntry page83_data_entry; - VHDXMetadataTableEntry logical_sector_size_entry; - VHDXMetadataTableEntry phys_sector_size_entry; - VHDXMetadataTableEntry parent_locator_entry; - uint16_t present; -} VHDXMetadataEntries; - typedef struct VHDXSectorInfo { uint32_t bat_idx; /* BAT entry index */ @@ -123,43 +128,31 @@ typedef struct VHDXSectorInfo { uint64_t block_offset; /* block offset, in bytes */ } VHDXSectorInfo; +/* Calculates new checksum. + * + * Zero is substituted during crc calculation for the original crc field + * crc_offset: byte offset in buf of the buffer crc + * buf: buffer pointer + * size: size of buffer (must be > crc_offset+4) + * + * Note: The resulting checksum is in the CPU endianness, not necessarily + * in the file format endianness (LE). Any header export to disk should + * make sure that vhdx_header_le_export() is used to convert to the + * correct endianness + */ +uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset) +{ + uint32_t crc; + assert(buf != NULL); + assert(size > (crc_offset + sizeof(crc))); -typedef struct BDRVVHDXState { - CoMutex lock; - - int curr_header; - VHDXHeader *headers[2]; - - VHDXRegionTableHeader rt; - VHDXRegionTableEntry bat_rt; /* region table for the BAT */ - VHDXRegionTableEntry metadata_rt; /* region table for the metadata */ - - VHDXMetadataTableHeader metadata_hdr; - VHDXMetadataEntries metadata_entries; - - VHDXFileParameters params; - uint32_t block_size; - uint32_t block_size_bits; - uint32_t sectors_per_block; - uint32_t sectors_per_block_bits; - - uint64_t virtual_disk_size; - uint32_t logical_sector_size; - uint32_t physical_sector_size; - - uint64_t chunk_ratio; - uint32_t chunk_ratio_bits; - uint32_t logical_sector_size_bits; - - uint32_t bat_entries; - VHDXBatEntry *bat; - uint64_t bat_offset; - - VHDXParentLocatorHeader parent_header; - VHDXParentLocatorEntry *parent_entries; + memset(buf + crc_offset, 0, sizeof(crc)); + crc = crc32c(0xffffffff, buf, size); + memcpy(buf + crc_offset, &crc, sizeof(crc)); -} BDRVVHDXState; + return crc; +} uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size, int crc_offset) @@ -211,6 +204,71 @@ bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset) } +/* + * This generates a UUID that is compliant with the MS GUIDs used + * in the VHDX spec (and elsewhere). + */ +void vhdx_guid_generate(MSGUID *guid) +{ + uuid_t uuid; + assert(guid != NULL); + + uuid_generate(uuid); + memcpy(guid, uuid, sizeof(MSGUID)); +} + +/* Check for region overlaps inside the VHDX image */ +static int vhdx_region_check(BDRVVHDXState *s, uint64_t start, uint64_t length) +{ + int ret = 0; + uint64_t end; + VHDXRegionEntry *r; + + end = start + length; + QLIST_FOREACH(r, &s->regions, entries) { + if (!((start >= r->end) || (end <= r->start))) { + ret = -EINVAL; + goto exit; + } + } + +exit: + return ret; +} + +/* Register a region for future checks */ +static void vhdx_region_register(BDRVVHDXState *s, + uint64_t start, uint64_t length) +{ + VHDXRegionEntry *r; + + r = g_malloc0(sizeof(*r)); + + r->start = start; + r->end = start + length; + + QLIST_INSERT_HEAD(&s->regions, r, entries); +} + +/* Free all registered regions */ +static void vhdx_region_unregister_all(BDRVVHDXState *s) +{ + VHDXRegionEntry *r, *r_next; + + QLIST_FOREACH_SAFE(r, &s->regions, entries, r_next) { + QLIST_REMOVE(r, entries); + g_free(r); + } +} + +static void vhdx_set_shift_bits(BDRVVHDXState *s) +{ + s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size); + s->sectors_per_block_bits = 31 - clz32(s->sectors_per_block); + s->chunk_ratio_bits = 63 - clz64(s->chunk_ratio); + s->block_size_bits = 31 - clz32(s->block_size); +} + /* * Per the MS VHDX Specification, for every VHDX file: * - The header section is fixed size - 1 MB @@ -230,30 +288,124 @@ static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -/* All VHDX structures on disk are little endian */ -static void vhdx_header_le_import(VHDXHeader *h) +/* + * Writes the header to the specified offset. + * + * This will optionally read in buffer data from disk (otherwise zero-fill), + * and then update the header checksum. Header is converted to proper + * endianness before being written to the specified file offset + */ +static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr, + uint64_t offset, bool read) +{ + uint8_t *buffer = NULL; + int ret; + VHDXHeader header_le; + + assert(bs_file != NULL); + assert(hdr != NULL); + + /* the header checksum is not over just the packed size of VHDXHeader, + * but rather over the entire 'reserved' range for the header, which is + * 4KB (VHDX_HEADER_SIZE). */ + + buffer = qemu_blockalign(bs_file, VHDX_HEADER_SIZE); + if (read) { + /* if true, we can't assume the extra reserved bytes are 0 */ + ret = bdrv_pread(bs_file, offset, buffer, VHDX_HEADER_SIZE); + if (ret < 0) { + goto exit; + } + } else { + memset(buffer, 0, VHDX_HEADER_SIZE); + } + + /* overwrite the actual VHDXHeader portion */ + memcpy(buffer, hdr, sizeof(VHDXHeader)); + hdr->checksum = vhdx_update_checksum(buffer, VHDX_HEADER_SIZE, + offsetof(VHDXHeader, checksum)); + vhdx_header_le_export(hdr, &header_le); + ret = bdrv_pwrite_sync(bs_file, offset, &header_le, sizeof(VHDXHeader)); + +exit: + qemu_vfree(buffer); + return ret; +} + +/* Update the VHDX headers + * + * This follows the VHDX spec procedures for header updates. + * + * - non-current header is updated with largest sequence number + */ +static int vhdx_update_header(BlockDriverState *bs, BDRVVHDXState *s, + bool generate_data_write_guid, MSGUID *log_guid) { - assert(h != NULL); + int ret = 0; + int hdr_idx = 0; + uint64_t header_offset = VHDX_HEADER1_OFFSET; + + VHDXHeader *active_header; + VHDXHeader *inactive_header; + + /* operate on the non-current header */ + if (s->curr_header == 0) { + hdr_idx = 1; + header_offset = VHDX_HEADER2_OFFSET; + } + + active_header = s->headers[s->curr_header]; + inactive_header = s->headers[hdr_idx]; - le32_to_cpus(&h->signature); - le32_to_cpus(&h->checksum); - le64_to_cpus(&h->sequence_number); + inactive_header->sequence_number = active_header->sequence_number + 1; - leguid_to_cpus(&h->file_write_guid); - leguid_to_cpus(&h->data_write_guid); - leguid_to_cpus(&h->log_guid); + /* a new file guid must be generated before any file write, including + * headers */ + inactive_header->file_write_guid = s->session_guid; + + /* a new data guid only needs to be generated before any guest-visible + * writes (i.e. something observable via virtual disk read) */ + if (generate_data_write_guid) { + vhdx_guid_generate(&inactive_header->data_write_guid); + } - le16_to_cpus(&h->log_version); - le16_to_cpus(&h->version); - le32_to_cpus(&h->log_length); - le64_to_cpus(&h->log_offset); + /* update the log guid if present */ + if (log_guid) { + inactive_header->log_guid = *log_guid; + } + + ret = vhdx_write_header(bs->file, inactive_header, header_offset, true); + if (ret < 0) { + goto exit; + } + s->curr_header = hdr_idx; + +exit: + return ret; } +/* + * The VHDX spec calls for header updates to be performed twice, so that both + * the current and non-current header have valid info + */ +int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s, + bool generate_data_write_guid, MSGUID *log_guid) +{ + int ret; + + ret = vhdx_update_header(bs, s, generate_data_write_guid, log_guid); + if (ret < 0) { + return ret; + } + ret = vhdx_update_header(bs, s, generate_data_write_guid, log_guid); + return ret; +} /* opens the specified header block from the VHDX file header section */ -static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s) +static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, + Error **errp) { - int ret = 0; + int ret; VHDXHeader *header1; VHDXHeader *header2; bool h1_valid = false; @@ -262,6 +414,7 @@ static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s) uint64_t h2_seq = 0; uint8_t *buffer; + /* header1 & header2 are freed in vhdx_close() */ header1 = qemu_blockalign(bs, sizeof(VHDXHeader)); header2 = qemu_blockalign(bs, sizeof(VHDXHeader)); @@ -310,7 +463,6 @@ static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s) } else if (!h1_valid && h2_valid) { s->curr_header = 1; } else if (!h1_valid && !h2_valid) { - ret = -EINVAL; goto fail; } else { /* If both headers are valid, then we choose the active one by the @@ -321,24 +473,22 @@ static int vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s) } else if (h2_seq > h1_seq) { s->curr_header = 1; } else { - ret = -EINVAL; goto fail; } } - ret = 0; - + vhdx_region_register(s, s->headers[s->curr_header]->log_offset, + s->headers[s->curr_header]->log_length); goto exit; fail: - qerror_report(ERROR_CLASS_GENERIC_ERROR, "No valid VHDX header found"); + error_setg_errno(errp, -ret, "No valid VHDX header found"); qemu_vfree(header1); qemu_vfree(header2); s->headers[0] = NULL; s->headers[1] = NULL; exit: qemu_vfree(buffer); - return ret; } @@ -362,10 +512,7 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) goto fail; } memcpy(&s->rt, buffer, sizeof(s->rt)); - le32_to_cpus(&s->rt.signature); - le32_to_cpus(&s->rt.checksum); - le32_to_cpus(&s->rt.entry_count); - le32_to_cpus(&s->rt.reserved); + vhdx_region_header_le_import(&s->rt); offset += sizeof(s->rt); if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4) || @@ -384,10 +531,16 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) memcpy(&rt_entry, buffer + offset, sizeof(rt_entry)); offset += sizeof(rt_entry); - leguid_to_cpus(&rt_entry.guid); - le64_to_cpus(&rt_entry.file_offset); - le32_to_cpus(&rt_entry.length); - le32_to_cpus(&rt_entry.data_bits); + vhdx_region_entry_le_import(&rt_entry); + + /* check for region overlap between these entries, and any + * other memory regions in the file */ + ret = vhdx_region_check(s, rt_entry.file_offset, rt_entry.length); + if (ret < 0) { + goto fail; + } + + vhdx_region_register(s, rt_entry.file_offset, rt_entry.length); /* see if we recognize the entry */ if (guid_eq(rt_entry.guid, bat_guid)) { @@ -419,6 +572,12 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) goto fail; } } + + if (!bat_rt_found || !metadata_rt_found) { + ret = -EINVAL; + goto fail; + } + ret = 0; fail: @@ -462,9 +621,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr)); offset += sizeof(s->metadata_hdr); - le64_to_cpus(&s->metadata_hdr.signature); - le16_to_cpus(&s->metadata_hdr.reserved); - le16_to_cpus(&s->metadata_hdr.entry_count); + vhdx_metadata_header_le_import(&s->metadata_hdr); if (memcmp(&s->metadata_hdr.signature, "metadata", 8)) { ret = -EINVAL; @@ -483,11 +640,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) memcpy(&md_entry, buffer + offset, sizeof(md_entry)); offset += sizeof(md_entry); - leguid_to_cpus(&md_entry.item_id); - le32_to_cpus(&md_entry.offset); - le32_to_cpus(&md_entry.length); - le32_to_cpus(&md_entry.data_bits); - le32_to_cpus(&md_entry.reserved2); + vhdx_metadata_entry_le_import(&md_entry); if (guid_eq(md_entry.item_id, file_param_guid)) { if (s->metadata_entries.present & META_FILE_PARAMETER_PRESENT) { @@ -627,12 +780,20 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) le32_to_cpus(&s->logical_sector_size); le32_to_cpus(&s->physical_sector_size); - if (s->logical_sector_size == 0 || s->params.block_size == 0) { + if (s->params.block_size < VHDX_BLOCK_SIZE_MIN || + s->params.block_size > VHDX_BLOCK_SIZE_MAX) { ret = -EINVAL; goto exit; } - /* both block_size and sector_size are guaranteed powers of 2 */ + /* only 2 supported sector sizes */ + if (s->logical_sector_size != 512 && s->logical_sector_size != 4096) { + ret = -EINVAL; + goto exit; + } + + /* Both block_size and sector_size are guaranteed powers of 2, below. + Due to range checks above, s->sectors_per_block can never be < 256 */ s->sectors_per_block = s->params.block_size / s->logical_sector_size; s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) * (uint64_t)s->logical_sector_size / @@ -660,10 +821,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) goto exit; } - s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size); - s->sectors_per_block_bits = 31 - clz32(s->sectors_per_block); - s->chunk_ratio_bits = 63 - clz64(s->chunk_ratio); - s->block_size_bits = 31 - clz32(s->block_size); + vhdx_set_shift_bits(s); ret = 0; @@ -672,61 +830,64 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) return ret; } -/* Parse the replay log. Per the VHDX spec, if the log is present - * it must be replayed prior to opening the file, even read-only. - * - * If read-only, we must replay the log in RAM (or refuse to open - * a dirty VHDX file read-only */ -static int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s) +/* + * Calculate the number of BAT entries, including sector + * bitmap entries. + */ +static void vhdx_calc_bat_entries(BDRVVHDXState *s) { - int ret = 0; - int i; - VHDXHeader *hdr; - - hdr = s->headers[s->curr_header]; + uint32_t data_blocks_cnt, bitmap_blocks_cnt; - /* either the log guid, or log length is zero, - * then a replay log is present */ - for (i = 0; i < sizeof(hdr->log_guid.data4); i++) { - ret |= hdr->log_guid.data4[i]; - } - if (hdr->log_guid.data1 == 0 && - hdr->log_guid.data2 == 0 && - hdr->log_guid.data3 == 0 && - ret == 0) { - goto exit; + data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits; + if (s->virtual_disk_size - (data_blocks_cnt << s->block_size_bits)) { + data_blocks_cnt++; } - - /* per spec, only log version of 0 is supported */ - if (hdr->log_version != 0) { - ret = -EINVAL; - goto exit; + bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits; + if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) { + bitmap_blocks_cnt++; } - if (hdr->log_length == 0) { - goto exit; + if (s->parent_entries) { + s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1); + } else { + s->bat_entries = data_blocks_cnt + + ((data_blocks_cnt - 1) >> s->chunk_ratio_bits); } - /* We currently do not support images with logs to replay */ - ret = -ENOTSUP; - -exit: - return ret; } +static void vhdx_close(BlockDriverState *bs) +{ + BDRVVHDXState *s = bs->opaque; + qemu_vfree(s->headers[0]); + s->headers[0] = NULL; + qemu_vfree(s->headers[1]); + s->headers[1] = NULL; + qemu_vfree(s->bat); + s->bat = NULL; + qemu_vfree(s->parent_entries); + s->parent_entries = NULL; + migrate_del_blocker(s->migration_blocker); + error_free(s->migration_blocker); + qemu_vfree(s->log.hdr); + s->log.hdr = NULL; + vhdx_region_unregister_all(s); +} -static int vhdx_open(BlockDriverState *bs, QDict *options, int flags) +static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVVHDXState *s = bs->opaque; int ret = 0; uint32_t i; uint64_t signature; - uint32_t data_blocks_cnt, bitmap_blocks_cnt; - + Error *local_err = NULL; s->bat = NULL; + s->first_visible_write = true; qemu_co_mutex_init(&s->lock); + QLIST_INIT(&s->regions); /* validate the file signature */ ret = bdrv_pread(bs->file, 0, &signature, sizeof(uint64_t)); @@ -738,46 +899,40 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } - ret = vhdx_parse_header(bs, s); - if (ret) { + /* This is used for any header updates, for the file_write_guid. + * The spec dictates that a new value should be used for the first + * header update */ + vhdx_guid_generate(&s->session_guid); + + vhdx_parse_header(bs, s, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); + ret = -EINVAL; goto fail; } - ret = vhdx_parse_log(bs, s); - if (ret) { + ret = vhdx_parse_log(bs, s, &s->log_replayed_on_open, errp); + if (ret < 0) { goto fail; } ret = vhdx_open_region_tables(bs, s); - if (ret) { + if (ret < 0) { goto fail; } ret = vhdx_parse_metadata(bs, s); - if (ret) { + if (ret < 0) { goto fail; } + s->block_size = s->params.block_size; /* the VHDX spec dictates that virtual_disk_size is always a multiple of * logical_sector_size */ bs->total_sectors = s->virtual_disk_size >> s->logical_sector_size_bits; - data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits; - if (s->virtual_disk_size - (data_blocks_cnt << s->block_size_bits)) { - data_blocks_cnt++; - } - bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits; - if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) { - bitmap_blocks_cnt++; - } - - if (s->parent_entries) { - s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1); - } else { - s->bat_entries = data_blocks_cnt + - ((data_blocks_cnt - 1) >> s->chunk_ratio_bits); - } + vhdx_calc_bat_entries(s); s->bat_offset = s->bat_rt.file_offset; @@ -787,6 +942,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } + /* s->bat is freed in vhdx_close() */ s->bat = qemu_blockalign(bs, s->bat_rt.length); ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length); @@ -794,23 +950,46 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } + uint64_t payblocks = s->chunk_ratio; + /* endian convert, and verify populated BAT field file offsets against + * region table and log entries */ for (i = 0; i < s->bat_entries; i++) { le64_to_cpus(&s->bat[i]); + if (payblocks--) { + /* payload bat entries */ + if ((s->bat[i] & VHDX_BAT_STATE_BIT_MASK) == + PAYLOAD_BLOCK_FULLY_PRESENT) { + ret = vhdx_region_check(s, s->bat[i] & VHDX_BAT_FILE_OFF_MASK, + s->block_size); + if (ret < 0) { + goto fail; + } + } + } else { + payblocks = s->chunk_ratio; + /* Once differencing files are supported, verify sector bitmap + * blocks here */ + } } if (flags & BDRV_O_RDWR) { - ret = -ENOTSUP; - goto fail; + ret = vhdx_update_headers(bs, s, false, NULL); + if (ret < 0) { + goto fail; + } } - /* TODO: differencing files, write */ + /* TODO: differencing files */ + + /* Disable migration when VHDX images are used */ + error_set(&s->migration_blocker, + QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + "vhdx", bs->device_name, "live migration"); + migrate_add_blocker(s->migration_blocker); return 0; fail: - qemu_vfree(s->headers[0]); - qemu_vfree(s->headers[1]); - qemu_vfree(s->bat); - qemu_vfree(s->parent_entries); + vhdx_close(bs); return ret; } @@ -850,7 +1029,7 @@ static void vhdx_block_translate(BDRVVHDXState *s, int64_t sector_num, sinfo->bytes_avail = sinfo->sectors_avail << s->logical_sector_size_bits; - sinfo->file_offset = s->bat[sinfo->bat_idx] >> VHDX_BAT_FILE_OFF_BITS; + sinfo->file_offset = s->bat[sinfo->bat_idx] & VHDX_BAT_FILE_OFF_MASK; sinfo->block_offset = block_offset << s->logical_sector_size_bits; @@ -864,11 +1043,22 @@ static void vhdx_block_translate(BDRVVHDXState *s, int64_t sector_num, * in the block, and add in the payload data block offset * in the file, in bytes, to get the final read address */ - sinfo->file_offset <<= 20; /* now in bytes, rather than 1MB units */ sinfo->file_offset += sinfo->block_offset; } +static int vhdx_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVVHDXState *s = bs->opaque; + + bdi->cluster_size = s->block_size; + + bdi->unallocated_blocks_are_zero = + (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) == 0; + + return 0; +} + static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) @@ -905,7 +1095,7 @@ static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num, /* return zero */ qemu_iovec_memset(&hd_qiov, 0, 0, sinfo.bytes_avail); break; - case PAYLOAD_BLOCK_FULL_PRESENT: + case PAYLOAD_BLOCK_FULLY_PRESENT: qemu_co_mutex_unlock(&s->lock); ret = bdrv_co_readv(bs->file, sinfo.file_offset >> BDRV_SECTOR_BITS, @@ -935,24 +1125,792 @@ static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num, return ret; } +/* + * Allocate a new payload block at the end of the file. + * + * Allocation will happen at 1MB alignment inside the file + * + * Returns the file offset start of the new payload block + */ +static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s, + uint64_t *new_offset) +{ + *new_offset = bdrv_getlength(bs->file); + + /* per the spec, the address for a block is in units of 1MB */ + *new_offset = ROUND_UP(*new_offset, 1024 * 1024); + + return bdrv_truncate(bs->file, *new_offset + s->block_size); +} + +/* + * Update the BAT table entry with the new file offset, and the new entry + * state */ +static void vhdx_update_bat_table_entry(BlockDriverState *bs, BDRVVHDXState *s, + VHDXSectorInfo *sinfo, + uint64_t *bat_entry_le, + uint64_t *bat_offset, int state) +{ + /* The BAT entry is a uint64, with 44 bits for the file offset in units of + * 1MB, and 3 bits for the block state. */ + s->bat[sinfo->bat_idx] = sinfo->file_offset; + + s->bat[sinfo->bat_idx] |= state & VHDX_BAT_STATE_BIT_MASK; + *bat_entry_le = cpu_to_le64(s->bat[sinfo->bat_idx]); + *bat_offset = s->bat_offset + sinfo->bat_idx * sizeof(VHDXBatEntry); + +} + +/* Per the spec, on the first write of guest-visible data to the file the + * data write guid must be updated in the header */ +int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s) +{ + int ret = 0; + if (s->first_visible_write) { + s->first_visible_write = false; + ret = vhdx_update_headers(bs, s, true, NULL); + } + return ret; +} static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return -ENOTSUP; + int ret = -ENOTSUP; + BDRVVHDXState *s = bs->opaque; + VHDXSectorInfo sinfo; + uint64_t bytes_done = 0; + uint64_t bat_entry = 0; + uint64_t bat_entry_offset = 0; + QEMUIOVector hd_qiov; + struct iovec iov1 = { 0 }; + struct iovec iov2 = { 0 }; + int sectors_to_write; + int bat_state; + uint64_t bat_prior_offset = 0; + bool bat_update = false; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + qemu_co_mutex_lock(&s->lock); + + ret = vhdx_user_visible_write(bs, s); + if (ret < 0) { + goto exit; + } + + while (nb_sectors > 0) { + bool use_zero_buffers = false; + bat_update = false; + if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) { + /* not supported yet */ + ret = -ENOTSUP; + goto exit; + } else { + vhdx_block_translate(s, sector_num, nb_sectors, &sinfo); + sectors_to_write = sinfo.sectors_avail; + + qemu_iovec_reset(&hd_qiov); + /* check the payload block state */ + bat_state = s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK; + switch (bat_state) { + case PAYLOAD_BLOCK_ZERO: + /* in this case, we need to preserve zero writes for + * data that is not part of this write, so we must pad + * the rest of the buffer to zeroes */ + + /* if we are on a posix system with ftruncate() that extends + * a file, then it is zero-filled for us. On Win32, the raw + * layer uses SetFilePointer and SetFileEnd, which does not + * zero fill AFAIK */ + + /* Queue another write of zero buffers if the underlying file + * does not zero-fill on file extension */ + + if (bdrv_has_zero_init(bs->file) == 0) { + use_zero_buffers = true; + + /* zero fill the front, if any */ + if (sinfo.block_offset) { + iov1.iov_len = sinfo.block_offset; + iov1.iov_base = qemu_blockalign(bs, iov1.iov_len); + memset(iov1.iov_base, 0, iov1.iov_len); + qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0, + sinfo.block_offset); + sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS; + } + + /* our actual data */ + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + sinfo.bytes_avail); + + /* zero fill the back, if any */ + if ((sinfo.bytes_avail - sinfo.block_offset) < + s->block_size) { + iov2.iov_len = s->block_size - + (sinfo.bytes_avail + sinfo.block_offset); + iov2.iov_base = qemu_blockalign(bs, iov2.iov_len); + memset(iov2.iov_base, 0, iov2.iov_len); + qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0, + sinfo.block_offset); + sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS; + } + } + + /* fall through */ + case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */ + case PAYLOAD_BLOCK_UNMAPPED: /* fall through */ + case PAYLOAD_BLOCK_UNDEFINED: /* fall through */ + bat_prior_offset = sinfo.file_offset; + ret = vhdx_allocate_block(bs, s, &sinfo.file_offset); + if (ret < 0) { + goto exit; + } + /* once we support differencing files, this may also be + * partially present */ + /* update block state to the newly specified state */ + vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry, + &bat_entry_offset, + PAYLOAD_BLOCK_FULLY_PRESENT); + bat_update = true; + /* since we just allocated a block, file_offset is the + * beginning of the payload block. It needs to be the + * write address, which includes the offset into the block */ + if (!use_zero_buffers) { + sinfo.file_offset += sinfo.block_offset; + } + /* fall through */ + case PAYLOAD_BLOCK_FULLY_PRESENT: + /* if the file offset address is in the header zone, + * there is a problem */ + if (sinfo.file_offset < (1024 * 1024)) { + ret = -EFAULT; + goto error_bat_restore; + } + + if (!use_zero_buffers) { + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + sinfo.bytes_avail); + } + /* block exists, so we can just overwrite it */ + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_writev(bs->file, + sinfo.file_offset >> BDRV_SECTOR_BITS, + sectors_to_write, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto error_bat_restore; + } + break; + case PAYLOAD_BLOCK_PARTIALLY_PRESENT: + /* we don't yet support difference files, fall through + * to error */ + default: + ret = -EIO; + goto exit; + break; + } + + if (bat_update) { + /* this will update the BAT entry into the log journal, and + * then flush the log journal out to disk */ + ret = vhdx_log_write_and_flush(bs, s, &bat_entry, + sizeof(VHDXBatEntry), + bat_entry_offset); + if (ret < 0) { + goto exit; + } + } + + nb_sectors -= sinfo.sectors_avail; + sector_num += sinfo.sectors_avail; + bytes_done += sinfo.bytes_avail; + + } + } + + goto exit; + +error_bat_restore: + if (bat_update) { + /* keep metadata in sync, and restore the bat entry state + * if error. */ + sinfo.file_offset = bat_prior_offset; + vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry, + &bat_entry_offset, bat_state); + } +exit: + qemu_vfree(iov1.iov_base); + qemu_vfree(iov2.iov_base); + qemu_co_mutex_unlock(&s->lock); + qemu_iovec_destroy(&hd_qiov); + return ret; } -static void vhdx_close(BlockDriverState *bs) + +/* + * Create VHDX Headers + * + * There are 2 headers, and the highest sequence number will represent + * the active header + */ +static int vhdx_create_new_headers(BlockDriverState *bs, uint64_t image_size, + uint32_t log_size) +{ + int ret = 0; + VHDXHeader *hdr = NULL; + + hdr = g_malloc0(sizeof(VHDXHeader)); + + hdr->signature = VHDX_HEADER_SIGNATURE; + hdr->sequence_number = g_random_int(); + hdr->log_version = 0; + hdr->version = 1; + hdr->log_length = log_size; + hdr->log_offset = VHDX_HEADER_SECTION_END; + vhdx_guid_generate(&hdr->file_write_guid); + vhdx_guid_generate(&hdr->data_write_guid); + + ret = vhdx_write_header(bs, hdr, VHDX_HEADER1_OFFSET, false); + if (ret < 0) { + goto exit; + } + hdr->sequence_number++; + ret = vhdx_write_header(bs, hdr, VHDX_HEADER2_OFFSET, false); + if (ret < 0) { + goto exit; + } + +exit: + g_free(hdr); + return ret; +} + + +/* + * Create the Metadata entries. + * + * For more details on the entries, see section 3.5 (pg 29) in the + * VHDX 1.00 specification. + * + * We support 5 metadata entries (all required by spec): + * File Parameters, + * Virtual Disk Size, + * Page 83 Data, + * Logical Sector Size, + * Physical Sector Size + * + * The first 64KB of the Metadata section is reserved for the metadata + * header and entries; beyond that, the metadata items themselves reside. + */ +static int vhdx_create_new_metadata(BlockDriverState *bs, + uint64_t image_size, + uint32_t block_size, + uint32_t sector_size, + uint64_t metadata_offset, + VHDXImageType type) +{ + int ret = 0; + uint32_t offset = 0; + void *buffer = NULL; + void *entry_buffer; + VHDXMetadataTableHeader *md_table;; + VHDXMetadataTableEntry *md_table_entry; + + /* Metadata entries */ + VHDXFileParameters *mt_file_params; + VHDXVirtualDiskSize *mt_virtual_size; + VHDXPage83Data *mt_page83; + VHDXVirtualDiskLogicalSectorSize *mt_log_sector_size; + VHDXVirtualDiskPhysicalSectorSize *mt_phys_sector_size; + + entry_buffer = g_malloc0(sizeof(VHDXFileParameters) + + sizeof(VHDXVirtualDiskSize) + + sizeof(VHDXPage83Data) + + sizeof(VHDXVirtualDiskLogicalSectorSize) + + sizeof(VHDXVirtualDiskPhysicalSectorSize)); + + mt_file_params = entry_buffer; + offset += sizeof(VHDXFileParameters); + mt_virtual_size = entry_buffer + offset; + offset += sizeof(VHDXVirtualDiskSize); + mt_page83 = entry_buffer + offset; + offset += sizeof(VHDXPage83Data); + mt_log_sector_size = entry_buffer + offset; + offset += sizeof(VHDXVirtualDiskLogicalSectorSize); + mt_phys_sector_size = entry_buffer + offset; + + mt_file_params->block_size = cpu_to_le32(block_size); + if (type == VHDX_TYPE_FIXED) { + mt_file_params->data_bits |= VHDX_PARAMS_LEAVE_BLOCKS_ALLOCED; + cpu_to_le32s(&mt_file_params->data_bits); + } + + vhdx_guid_generate(&mt_page83->page_83_data); + cpu_to_leguids(&mt_page83->page_83_data); + mt_virtual_size->virtual_disk_size = cpu_to_le64(image_size); + mt_log_sector_size->logical_sector_size = cpu_to_le32(sector_size); + mt_phys_sector_size->physical_sector_size = cpu_to_le32(sector_size); + + buffer = g_malloc0(VHDX_HEADER_BLOCK_SIZE); + md_table = buffer; + + md_table->signature = VHDX_METADATA_SIGNATURE; + md_table->entry_count = 5; + vhdx_metadata_header_le_export(md_table); + + + /* This will reference beyond the reserved table portion */ + offset = 64 * KiB; + + md_table_entry = buffer + sizeof(VHDXMetadataTableHeader); + + md_table_entry[0].item_id = file_param_guid; + md_table_entry[0].offset = offset; + md_table_entry[0].length = sizeof(VHDXFileParameters); + md_table_entry[0].data_bits |= VHDX_META_FLAGS_IS_REQUIRED; + offset += md_table_entry[0].length; + vhdx_metadata_entry_le_export(&md_table_entry[0]); + + md_table_entry[1].item_id = virtual_size_guid; + md_table_entry[1].offset = offset; + md_table_entry[1].length = sizeof(VHDXVirtualDiskSize); + md_table_entry[1].data_bits |= VHDX_META_FLAGS_IS_REQUIRED | + VHDX_META_FLAGS_IS_VIRTUAL_DISK; + offset += md_table_entry[1].length; + vhdx_metadata_entry_le_export(&md_table_entry[1]); + + md_table_entry[2].item_id = page83_guid; + md_table_entry[2].offset = offset; + md_table_entry[2].length = sizeof(VHDXPage83Data); + md_table_entry[2].data_bits |= VHDX_META_FLAGS_IS_REQUIRED | + VHDX_META_FLAGS_IS_VIRTUAL_DISK; + offset += md_table_entry[2].length; + vhdx_metadata_entry_le_export(&md_table_entry[2]); + + md_table_entry[3].item_id = logical_sector_guid; + md_table_entry[3].offset = offset; + md_table_entry[3].length = sizeof(VHDXVirtualDiskLogicalSectorSize); + md_table_entry[3].data_bits |= VHDX_META_FLAGS_IS_REQUIRED | + VHDX_META_FLAGS_IS_VIRTUAL_DISK; + offset += md_table_entry[3].length; + vhdx_metadata_entry_le_export(&md_table_entry[3]); + + md_table_entry[4].item_id = phys_sector_guid; + md_table_entry[4].offset = offset; + md_table_entry[4].length = sizeof(VHDXVirtualDiskPhysicalSectorSize); + md_table_entry[4].data_bits |= VHDX_META_FLAGS_IS_REQUIRED | + VHDX_META_FLAGS_IS_VIRTUAL_DISK; + vhdx_metadata_entry_le_export(&md_table_entry[4]); + + ret = bdrv_pwrite(bs, metadata_offset, buffer, VHDX_HEADER_BLOCK_SIZE); + if (ret < 0) { + goto exit; + } + + ret = bdrv_pwrite(bs, metadata_offset + (64 * KiB), entry_buffer, + VHDX_HEADER_BLOCK_SIZE); + if (ret < 0) { + goto exit; + } + + +exit: + g_free(buffer); + g_free(entry_buffer); + return ret; +} + +/* This create the actual BAT itself. We currently only support + * 'Dynamic' and 'Fixed' image types. + * + * Dynamic images: default state of the BAT is all zeroes. + * + * Fixed images: default state of the BAT is fully populated, with + * file offsets and state PAYLOAD_BLOCK_FULLY_PRESENT. + */ +static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, + uint64_t image_size, VHDXImageType type, + bool use_zero_blocks, VHDXRegionTableEntry *rt_bat) +{ + int ret = 0; + uint64_t data_file_offset; + uint64_t total_sectors = 0; + uint64_t sector_num = 0; + uint64_t unused; + int block_state; + VHDXSectorInfo sinfo; + + assert(s->bat == NULL); + + /* this gives a data start after BAT/bitmap entries, and well + * past any metadata entries (with a 4 MB buffer for future + * expansion */ + data_file_offset = rt_bat->file_offset + rt_bat->length + 5 * MiB; + total_sectors = image_size >> s->logical_sector_size_bits; + + if (type == VHDX_TYPE_DYNAMIC) { + /* All zeroes, so we can just extend the file - the end of the BAT + * is the furthest thing we have written yet */ + ret = bdrv_truncate(bs, data_file_offset); + if (ret < 0) { + goto exit; + } + } else if (type == VHDX_TYPE_FIXED) { + ret = bdrv_truncate(bs, data_file_offset + image_size); + if (ret < 0) { + goto exit; + } + } else { + ret = -ENOTSUP; + goto exit; + } + + if (type == VHDX_TYPE_FIXED || + use_zero_blocks || + bdrv_has_zero_init(bs) == 0) { + /* for a fixed file, the default BAT entry is not zero */ + s->bat = g_malloc0(rt_bat->length); + block_state = type == VHDX_TYPE_FIXED ? PAYLOAD_BLOCK_FULLY_PRESENT : + PAYLOAD_BLOCK_NOT_PRESENT; + block_state = use_zero_blocks ? PAYLOAD_BLOCK_ZERO : block_state; + /* fill the BAT by emulating sector writes of sectors_per_block size */ + while (sector_num < total_sectors) { + vhdx_block_translate(s, sector_num, s->sectors_per_block, &sinfo); + sinfo.file_offset = data_file_offset + + (sector_num << s->logical_sector_size_bits); + sinfo.file_offset = ROUND_UP(sinfo.file_offset, MiB); + vhdx_update_bat_table_entry(bs, s, &sinfo, &unused, &unused, + block_state); + cpu_to_le64s(&s->bat[sinfo.bat_idx]); + sector_num += s->sectors_per_block; + } + ret = bdrv_pwrite(bs, rt_bat->file_offset, s->bat, rt_bat->length); + if (ret < 0) { + goto exit; + } + } + + + +exit: + g_free(s->bat); + return ret; +} + +/* Creates the region table header, and region table entries. + * There are 2 supported region table entries: BAT, and Metadata/ + * + * As the calculations for the BAT region table are also needed + * to create the BAT itself, we will also cause the BAT to be + * created. + */ +static int vhdx_create_new_region_table(BlockDriverState *bs, + uint64_t image_size, + uint32_t block_size, + uint32_t sector_size, + uint32_t log_size, + bool use_zero_blocks, + VHDXImageType type, + uint64_t *metadata_offset) +{ + int ret = 0; + uint32_t offset = 0; + void *buffer = NULL; + BDRVVHDXState *s = NULL; + VHDXRegionTableHeader *region_table; + VHDXRegionTableEntry *rt_bat; + VHDXRegionTableEntry *rt_metadata; + + assert(metadata_offset != NULL); + + /* Populate enough of the BDRVVHDXState to be able to use the + * pre-existing BAT calculation, translation, and update functions */ + s = g_malloc0(sizeof(BDRVVHDXState)); + + s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) * + (uint64_t) sector_size / (uint64_t) block_size; + + s->sectors_per_block = block_size / sector_size; + s->virtual_disk_size = image_size; + s->block_size = block_size; + s->logical_sector_size = sector_size; + + vhdx_set_shift_bits(s); + + vhdx_calc_bat_entries(s); + + /* At this point the VHDX state is populated enough for creation */ + + /* a single buffer is used so we can calculate the checksum over the + * entire 64KB block */ + buffer = g_malloc0(VHDX_HEADER_BLOCK_SIZE); + region_table = buffer; + offset += sizeof(VHDXRegionTableHeader); + rt_bat = buffer + offset; + offset += sizeof(VHDXRegionTableEntry); + rt_metadata = buffer + offset; + + region_table->signature = VHDX_REGION_SIGNATURE; + region_table->entry_count = 2; /* BAT and Metadata */ + + rt_bat->guid = bat_guid; + rt_bat->length = ROUND_UP(s->bat_entries * sizeof(VHDXBatEntry), MiB); + rt_bat->file_offset = ROUND_UP(VHDX_HEADER_SECTION_END + log_size, MiB); + s->bat_offset = rt_bat->file_offset; + + rt_metadata->guid = metadata_guid; + rt_metadata->file_offset = ROUND_UP(rt_bat->file_offset + rt_bat->length, + MiB); + rt_metadata->length = 1 * MiB; /* min size, and more than enough */ + *metadata_offset = rt_metadata->file_offset; + + vhdx_update_checksum(buffer, VHDX_HEADER_BLOCK_SIZE, + offsetof(VHDXRegionTableHeader, checksum)); + + + /* The region table gives us the data we need to create the BAT, + * so do that now */ + ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks, rt_bat); + + /* Now write out the region headers to disk */ + vhdx_region_header_le_export(region_table); + vhdx_region_entry_le_export(rt_bat); + vhdx_region_entry_le_export(rt_metadata); + + ret = bdrv_pwrite(bs, VHDX_REGION_TABLE_OFFSET, buffer, + VHDX_HEADER_BLOCK_SIZE); + if (ret < 0) { + goto exit; + } + + ret = bdrv_pwrite(bs, VHDX_REGION_TABLE2_OFFSET, buffer, + VHDX_HEADER_BLOCK_SIZE); + if (ret < 0) { + goto exit; + } + + +exit: + g_free(s); + g_free(buffer); + return ret; +} + +/* We need to create the following elements: + * + * .-----------------------------------------------------------------. + * | (A) | (B) | (C) | (D) | (E) | + * | File ID | Header1 | Header 2 | Region Tbl 1 | Region Tbl 2 | + * | | | | | | + * .-----------------------------------------------------------------. + * 0 64KB 128KB 192KB 256KB 320KB + * + * + * .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------. + * | (F) | (G) | (H) | | + * | Journal Log | BAT / Bitmap | Metadata | .... data ...... | + * | | | | | + * .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------. + * 1MB + */ +static int vhdx_create(const char *filename, QEMUOptionParameter *options, + Error **errp) +{ + int ret = 0; + uint64_t image_size = (uint64_t) 2 * GiB; + uint32_t log_size = 1 * MiB; + uint32_t block_size = 0; + uint64_t signature; + uint64_t metadata_offset; + bool use_zero_blocks = false; + + gunichar2 *creator = NULL; + glong creator_items; + BlockDriverState *bs; + const char *type = NULL; + VHDXImageType image_type; + Error *local_err = NULL; + + while (options && options->name) { + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { + image_size = options->value.n; + } else if (!strcmp(options->name, VHDX_BLOCK_OPT_LOG_SIZE)) { + log_size = options->value.n; + } else if (!strcmp(options->name, VHDX_BLOCK_OPT_BLOCK_SIZE)) { + block_size = options->value.n; + } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) { + type = options->value.s; + } else if (!strcmp(options->name, VHDX_BLOCK_OPT_ZERO)) { + use_zero_blocks = options->value.n != 0; + } + options++; + } + + if (image_size > VHDX_MAX_IMAGE_SIZE) { + error_setg_errno(errp, EINVAL, "Image size too large; max of 64TB"); + ret = -EINVAL; + goto exit; + } + + if (type == NULL) { + type = "dynamic"; + } + + if (!strcmp(type, "dynamic")) { + image_type = VHDX_TYPE_DYNAMIC; + } else if (!strcmp(type, "fixed")) { + image_type = VHDX_TYPE_FIXED; + } else if (!strcmp(type, "differencing")) { + error_setg_errno(errp, ENOTSUP, + "Differencing files not yet supported"); + ret = -ENOTSUP; + goto exit; + } else { + ret = -EINVAL; + goto exit; + } + + /* These are pretty arbitrary, and mainly designed to keep the BAT + * size reasonable to load into RAM */ + if (block_size == 0) { + if (image_size > 32 * TiB) { + block_size = 64 * MiB; + } else if (image_size > (uint64_t) 100 * GiB) { + block_size = 32 * MiB; + } else if (image_size > 1 * GiB) { + block_size = 16 * MiB; + } else { + block_size = 8 * MiB; + } + } + + + /* make the log size close to what was specified, but must be + * min 1MB, and multiple of 1MB */ + log_size = ROUND_UP(log_size, MiB); + + block_size = ROUND_UP(block_size, MiB); + block_size = block_size > VHDX_BLOCK_SIZE_MAX ? VHDX_BLOCK_SIZE_MAX : + block_size; + + ret = bdrv_create_file(filename, options, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; + } + + bs = NULL; + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + NULL, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; + } + + /* Create (A) */ + + /* The creator field is optional, but may be useful for + * debugging / diagnostics */ + creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL, + &creator_items, NULL); + signature = cpu_to_le64(VHDX_FILE_SIGNATURE); + ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature)); + if (ret < 0) { + goto delete_and_exit; + } + if (creator) { + ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET + sizeof(signature), + creator, creator_items * sizeof(gunichar2)); + if (ret < 0) { + goto delete_and_exit; + } + } + + + /* Creates (B),(C) */ + ret = vhdx_create_new_headers(bs, image_size, log_size); + if (ret < 0) { + goto delete_and_exit; + } + + /* Creates (D),(E),(G) explicitly. (F) created as by-product */ + ret = vhdx_create_new_region_table(bs, image_size, block_size, 512, + log_size, use_zero_blocks, image_type, + &metadata_offset); + if (ret < 0) { + goto delete_and_exit; + } + + /* Creates (H) */ + ret = vhdx_create_new_metadata(bs, image_size, block_size, 512, + metadata_offset, image_type); + if (ret < 0) { + goto delete_and_exit; + } + + + +delete_and_exit: + bdrv_unref(bs); +exit: + g_free(creator); + return ret; +} + +/* If opened r/w, the VHDX driver will automatically replay the log, + * if one is present, inside the vhdx_open() call. + * + * If qemu-img check -r all is called, the image is automatically opened + * r/w and any log has already been replayed, so there is nothing (currently) + * for us to do here + */ +static int vhdx_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) { BDRVVHDXState *s = bs->opaque; - qemu_vfree(s->headers[0]); - qemu_vfree(s->headers[1]); - qemu_vfree(s->bat); - qemu_vfree(s->parent_entries); + + if (s->log_replayed_on_open) { + result->corruptions_fixed++; + } + return 0; } +static QEMUOptionParameter vhdx_create_options[] = { + { + .name = BLOCK_OPT_SIZE, + .type = OPT_SIZE, + .help = "Virtual disk size; max of 64TB." + }, + { + .name = VHDX_BLOCK_OPT_LOG_SIZE, + .type = OPT_SIZE, + .value.n = 1 * MiB, + .help = "Log size; min 1MB." + }, + { + .name = VHDX_BLOCK_OPT_BLOCK_SIZE, + .type = OPT_SIZE, + .value.n = 0, + .help = "Block Size; min 1MB, max 256MB. " \ + "0 means auto-calculate based on image size." + }, + { + .name = BLOCK_OPT_SUBFMT, + .type = OPT_STRING, + .help = "VHDX format type, can be either 'dynamic' or 'fixed'. "\ + "Default is 'dynamic'." + }, + { + .name = VHDX_BLOCK_OPT_ZERO, + .type = OPT_FLAG, + .help = "Force use of payload blocks of type 'ZERO'. Non-standard." + }, + { NULL } +}; + static BlockDriver bdrv_vhdx = { .format_name = "vhdx", .instance_size = sizeof(BDRVVHDXState), @@ -962,6 +1920,11 @@ static BlockDriver bdrv_vhdx = { .bdrv_reopen_prepare = vhdx_reopen_prepare, .bdrv_co_readv = vhdx_co_readv, .bdrv_co_writev = vhdx_co_writev, + .bdrv_create = vhdx_create, + .bdrv_get_info = vhdx_get_info, + .bdrv_check = vhdx_check, + + .create_options = vhdx_create_options, }; static void bdrv_vhdx_init(void) diff --git a/block/vhdx.h b/block/vhdx.h index fb687ed2d6..8103d4c446 100644 --- a/block/vhdx.h +++ b/block/vhdx.h @@ -6,9 +6,9 @@ * Authors: * Jeff Cody * - * This is based on the "VHDX Format Specification v0.95", published 4/12/2012 + * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 * by Microsoft: - * https://www.microsoft.com/en-us/download/details.aspx?id=29681 + * https://www.microsoft.com/en-us/download/details.aspx?id=34750 * * This work is licensed under the terms of the GNU LGPL, version 2 or later. * See the COPYING.LIB file in the top-level directory. @@ -18,6 +18,11 @@ #ifndef BLOCK_VHDX_H #define BLOCK_VHDX_H +#define KiB (1 * 1024) +#define MiB (KiB * 1024) +#define GiB (MiB * 1024) +#define TiB ((uint64_t) GiB * 1024) + /* Structures and fields present in the VHDX file */ /* The header section has the following blocks, @@ -30,14 +35,15 @@ * 0.........64KB...........128KB........192KB..........256KB................1MB */ -#define VHDX_HEADER_BLOCK_SIZE (64*1024) +#define VHDX_HEADER_BLOCK_SIZE (64 * 1024) #define VHDX_FILE_ID_OFFSET 0 -#define VHDX_HEADER1_OFFSET (VHDX_HEADER_BLOCK_SIZE*1) -#define VHDX_HEADER2_OFFSET (VHDX_HEADER_BLOCK_SIZE*2) -#define VHDX_REGION_TABLE_OFFSET (VHDX_HEADER_BLOCK_SIZE*3) - +#define VHDX_HEADER1_OFFSET (VHDX_HEADER_BLOCK_SIZE * 1) +#define VHDX_HEADER2_OFFSET (VHDX_HEADER_BLOCK_SIZE * 2) +#define VHDX_REGION_TABLE_OFFSET (VHDX_HEADER_BLOCK_SIZE * 3) +#define VHDX_REGION_TABLE2_OFFSET (VHDX_HEADER_BLOCK_SIZE * 4) +#define VHDX_HEADER_SECTION_END (1 * MiB) /* * A note on the use of MS-GUID fields. For more details on the GUID, * please see: https://en.wikipedia.org/wiki/Globally_unique_identifier. @@ -55,10 +61,11 @@ /* These structures are ones that are defined in the VHDX specification * document */ +#define VHDX_FILE_SIGNATURE 0x656C696678646876ULL /* "vhdxfile" in ASCII */ typedef struct VHDXFileIdentifier { uint64_t signature; /* "vhdxfile" in ASCII */ uint16_t creator[256]; /* optional; utf-16 string to identify - the vhdx file creator. Diagnotistic + the vhdx file creator. Diagnostic only */ } VHDXFileIdentifier; @@ -67,7 +74,7 @@ typedef struct VHDXFileIdentifier { * Microsoft is not just 16 bytes though - it is a structure that is defined, * so we need to follow it here so that endianness does not trip us up */ -typedef struct MSGUID { +typedef struct QEMU_PACKED MSGUID { uint32_t data1; uint16_t data2; uint16_t data3; @@ -77,14 +84,15 @@ typedef struct MSGUID { #define guid_eq(a, b) \ (memcmp(&(a), &(b), sizeof(MSGUID)) == 0) -#define VHDX_HEADER_SIZE (4*1024) /* although the vhdx_header struct in disk - is only 582 bytes, for purposes of crc - the header is the first 4KB of the 64KB - block */ +#define VHDX_HEADER_SIZE (4 * 1024) /* although the vhdx_header struct in disk + is only 582 bytes, for purposes of crc + the header is the first 4KB of the 64KB + block */ /* The full header is 4KB, although the actual header data is much smaller. * But for the checksum calculation, it is over the entire 4KB structure, * not just the defined portion of it */ +#define VHDX_HEADER_SIGNATURE 0x64616568 typedef struct QEMU_PACKED VHDXHeader { uint32_t signature; /* "head" in ASCII */ uint32_t checksum; /* CRC-32C hash of the whole header */ @@ -92,7 +100,7 @@ typedef struct QEMU_PACKED VHDXHeader { VHDX file has 2 of these headers, and only the header with the highest sequence number is valid */ - MSGUID file_write_guid; /* 128 bit unique identifier. Must be + MSGUID file_write_guid; /* 128 bit unique identifier. Must be updated to new, unique value before the first modification is made to file */ @@ -114,9 +122,9 @@ typedef struct QEMU_PACKED VHDXHeader { there is no valid log. If non-zero, log entries with this guid are valid. */ - uint16_t log_version; /* version of the log format. Mustn't be - zero, unless log_guid is also zero */ - uint16_t version; /* version of th evhdx file. Currently, + uint16_t log_version; /* version of the log format. Must be + set to zero */ + uint16_t version; /* version of the vhdx file. Currently, only supported version is "1" */ uint32_t log_length; /* length of the log. Must be multiple of 1MB */ @@ -125,6 +133,7 @@ typedef struct QEMU_PACKED VHDXHeader { } VHDXHeader; /* Header for the region table block */ +#define VHDX_REGION_SIGNATURE 0x69676572 /* "regi" in ASCII */ typedef struct QEMU_PACKED VHDXRegionTableHeader { uint32_t signature; /* "regi" in ASCII */ uint32_t checksum; /* CRC-32C hash of the 64KB table */ @@ -151,7 +160,10 @@ typedef struct QEMU_PACKED VHDXRegionTableEntry { /* ---- LOG ENTRY STRUCTURES ---- */ +#define VHDX_LOG_MIN_SIZE (1024 * 1024) +#define VHDX_LOG_SECTOR_SIZE 4096 #define VHDX_LOG_HDR_SIZE 64 +#define VHDX_LOG_SIGNATURE 0x65676f6c typedef struct QEMU_PACKED VHDXLogEntryHeader { uint32_t signature; /* "loge" in ASCII */ uint32_t checksum; /* CRC-32C hash of the 64KB table */ @@ -174,7 +186,8 @@ typedef struct QEMU_PACKED VHDXLogEntryHeader { } VHDXLogEntryHeader; #define VHDX_LOG_DESC_SIZE 32 - +#define VHDX_LOG_DESC_SIGNATURE 0x63736564 +#define VHDX_LOG_ZERO_SIGNATURE 0x6f72657a typedef struct QEMU_PACKED VHDXLogDescriptor { uint32_t signature; /* "zero" or "desc" in ASCII */ union { @@ -194,6 +207,7 @@ typedef struct QEMU_PACKED VHDXLogDescriptor { vhdx_log_entry_header */ } VHDXLogDescriptor; +#define VHDX_LOG_DATA_SIGNATURE 0x61746164 typedef struct QEMU_PACKED VHDXLogDataSector { uint32_t data_signature; /* "data" in ASCII */ uint32_t sequence_high; /* 4 MSB of 8 byte sequence_number */ @@ -212,19 +226,19 @@ typedef struct QEMU_PACKED VHDXLogDataSector { #define PAYLOAD_BLOCK_UNDEFINED 1 #define PAYLOAD_BLOCK_ZERO 2 #define PAYLOAD_BLOCK_UNMAPPED 5 -#define PAYLOAD_BLOCK_FULL_PRESENT 6 +#define PAYLOAD_BLOCK_FULLY_PRESENT 6 #define PAYLOAD_BLOCK_PARTIALLY_PRESENT 7 #define SB_BLOCK_NOT_PRESENT 0 #define SB_BLOCK_PRESENT 6 /* per the spec */ -#define VHDX_MAX_SECTORS_PER_BLOCK (1<<23) +#define VHDX_MAX_SECTORS_PER_BLOCK (1 << 23) /* upper 44 bits are the file offset in 1MB units lower 3 bits are the state other bits are reserved */ #define VHDX_BAT_STATE_BIT_MASK 0x07 -#define VHDX_BAT_FILE_OFF_BITS (64-44) +#define VHDX_BAT_FILE_OFF_MASK 0xFFFFFFFFFFF00000ULL /* upper 44 bits */ typedef uint64_t VHDXBatEntry; /* ---- METADATA REGION STRUCTURES ---- */ @@ -233,6 +247,7 @@ typedef uint64_t VHDXBatEntry; #define VHDX_METADATA_MAX_ENTRIES 2047 /* not including the header */ #define VHDX_METADATA_TABLE_MAX_SIZE \ (VHDX_METADATA_ENTRY_SIZE * (VHDX_METADATA_MAX_ENTRIES+1)) +#define VHDX_METADATA_SIGNATURE 0x617461646174656DULL /* "metadata" in ASCII */ typedef struct QEMU_PACKED VHDXMetadataTableHeader { uint64_t signature; /* "metadata" in ASCII */ uint16_t reserved; @@ -252,8 +267,8 @@ typedef struct QEMU_PACKED VHDXMetadataTableEntry { metadata region */ /* note: if length = 0, so is offset */ uint32_t length; /* length of metadata. <= 1MB. */ - uint32_t data_bits; /* least-significant 3 bits are flags, the - rest are reserved (see above) */ + uint32_t data_bits; /* least-significant 3 bits are flags, + the rest are reserved (see above) */ uint32_t reserved2; } VHDXMetadataTableEntry; @@ -262,13 +277,16 @@ typedef struct QEMU_PACKED VHDXMetadataTableEntry { If set indicates a fixed size VHDX file */ #define VHDX_PARAMS_HAS_PARENT 0x02 /* has parent / backing file */ +#define VHDX_BLOCK_SIZE_MIN (1 * MiB) +#define VHDX_BLOCK_SIZE_MAX (256 * MiB) typedef struct QEMU_PACKED VHDXFileParameters { uint32_t block_size; /* size of each payload block, always power of 2, <= 256MB and >= 1MB. */ - uint32_t data_bits; /* least-significant 2 bits are flags, the rest - are reserved (see above) */ + uint32_t data_bits; /* least-significant 2 bits are flags, + the rest are reserved (see above) */ } VHDXFileParameters; +#define VHDX_MAX_IMAGE_SIZE ((uint64_t) 64 * TiB) typedef struct QEMU_PACKED VHDXVirtualDiskSize { uint64_t virtual_disk_size; /* Size of the virtual disk, in bytes. Must be multiple of the sector size, @@ -276,7 +294,7 @@ typedef struct QEMU_PACKED VHDXVirtualDiskSize { } VHDXVirtualDiskSize; typedef struct QEMU_PACKED VHDXPage83Data { - MSGUID page_83_data[16]; /* unique id for scsi devices that + MSGUID page_83_data; /* unique id for scsi devices that support page 0x83 */ } VHDXPage83Data; @@ -291,7 +309,7 @@ typedef struct QEMU_PACKED VHDXVirtualDiskPhysicalSectorSize { } VHDXVirtualDiskPhysicalSectorSize; typedef struct QEMU_PACKED VHDXParentLocatorHeader { - MSGUID locator_type[16]; /* type of the parent virtual disk. */ + MSGUID locator_type; /* type of the parent virtual disk. */ uint16_t reserved; uint16_t key_value_count; /* number of key/value pairs for this locator */ @@ -308,18 +326,125 @@ typedef struct QEMU_PACKED VHDXParentLocatorEntry { /* ----- END VHDX SPECIFICATION STRUCTURES ---- */ +typedef struct VHDXMetadataEntries { + VHDXMetadataTableEntry file_parameters_entry; + VHDXMetadataTableEntry virtual_disk_size_entry; + VHDXMetadataTableEntry page83_data_entry; + VHDXMetadataTableEntry logical_sector_size_entry; + VHDXMetadataTableEntry phys_sector_size_entry; + VHDXMetadataTableEntry parent_locator_entry; + uint16_t present; +} VHDXMetadataEntries; + +typedef struct VHDXLogEntries { + uint64_t offset; + uint64_t length; + uint32_t write; + uint32_t read; + VHDXLogEntryHeader *hdr; + void *desc_buffer; + uint64_t sequence; + uint32_t tail; +} VHDXLogEntries; + +typedef struct VHDXRegionEntry { + uint64_t start; + uint64_t end; + QLIST_ENTRY(VHDXRegionEntry) entries; +} VHDXRegionEntry; + +typedef struct BDRVVHDXState { + CoMutex lock; + + int curr_header; + VHDXHeader *headers[2]; + + VHDXRegionTableHeader rt; + VHDXRegionTableEntry bat_rt; /* region table for the BAT */ + VHDXRegionTableEntry metadata_rt; /* region table for the metadata */ + + VHDXMetadataTableHeader metadata_hdr; + VHDXMetadataEntries metadata_entries; + + VHDXFileParameters params; + uint32_t block_size; + uint32_t block_size_bits; + uint32_t sectors_per_block; + uint32_t sectors_per_block_bits; + + uint64_t virtual_disk_size; + uint32_t logical_sector_size; + uint32_t physical_sector_size; + + uint64_t chunk_ratio; + uint32_t chunk_ratio_bits; + uint32_t logical_sector_size_bits; + + uint32_t bat_entries; + VHDXBatEntry *bat; + uint64_t bat_offset; + bool first_visible_write; + MSGUID session_guid; + + VHDXLogEntries log; + + VHDXParentLocatorHeader parent_header; + VHDXParentLocatorEntry *parent_entries; + + Error *migration_blocker; + + bool log_replayed_on_open; + + QLIST_HEAD(VHDXRegionHead, VHDXRegionEntry) regions; +} BDRVVHDXState; + +void vhdx_guid_generate(MSGUID *guid); + +int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s, bool rw, + MSGUID *log_guid); + +uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset); uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size, int crc_offset); bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset); +int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed, + Error **errp); + +int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, + void *data, uint32_t length, uint64_t offset); -static void leguid_to_cpus(MSGUID *guid) +static inline void leguid_to_cpus(MSGUID *guid) { le32_to_cpus(&guid->data1); le16_to_cpus(&guid->data2); le16_to_cpus(&guid->data3); } +static inline void cpu_to_leguids(MSGUID *guid) +{ + cpu_to_le32s(&guid->data1); + cpu_to_le16s(&guid->data2); + cpu_to_le16s(&guid->data3); +} + +void vhdx_header_le_import(VHDXHeader *h); +void vhdx_header_le_export(VHDXHeader *orig_h, VHDXHeader *new_h); +void vhdx_log_desc_le_import(VHDXLogDescriptor *d); +void vhdx_log_desc_le_export(VHDXLogDescriptor *d); +void vhdx_log_data_le_export(VHDXLogDataSector *d); +void vhdx_log_entry_hdr_le_import(VHDXLogEntryHeader *hdr); +void vhdx_log_entry_hdr_le_export(VHDXLogEntryHeader *hdr); +void vhdx_region_header_le_import(VHDXRegionTableHeader *hdr); +void vhdx_region_header_le_export(VHDXRegionTableHeader *hdr); +void vhdx_region_entry_le_import(VHDXRegionTableEntry *e); +void vhdx_region_entry_le_export(VHDXRegionTableEntry *e); +void vhdx_metadata_header_le_import(VHDXMetadataTableHeader *hdr); +void vhdx_metadata_header_le_export(VHDXMetadataTableHeader *hdr); +void vhdx_metadata_entry_le_import(VHDXMetadataTableEntry *e); +void vhdx_metadata_entry_le_export(VHDXMetadataTableEntry *e); +int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s); + #endif diff --git a/block/vmdk.c b/block/vmdk.c index 346bb5cad9..480ea37d7c 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -105,18 +105,22 @@ typedef struct VmdkExtent { uint32_t l2_cache_offsets[L2_CACHE_SIZE]; uint32_t l2_cache_counts[L2_CACHE_SIZE]; - unsigned int cluster_sectors; + int64_t cluster_sectors; + char *type; } VmdkExtent; typedef struct BDRVVmdkState { CoMutex lock; uint64_t desc_offset; bool cid_updated; + bool cid_checked; + uint32_t cid; uint32_t parent_cid; int num_extents; /* Extent array with num_extents entries, ascend ordered by address */ VmdkExtent *extents; Error *migration_blocker; + char *create_type; } BDRVVmdkState; typedef struct VmdkMetaData { @@ -197,8 +201,6 @@ static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename) } } -#define CHECK_CID 1 - #define SECTOR_SIZE 512 #define DESC_SIZE (20 * SECTOR_SIZE) /* 20 sectors of 512 bytes each */ #define BUF_SIZE 4096 @@ -215,8 +217,9 @@ static void vmdk_free_extents(BlockDriverState *bs) g_free(e->l1_table); g_free(e->l2_cache); g_free(e->l1_backup_table); + g_free(e->type); if (e->file != bs->file) { - bdrv_delete(e->file); + bdrv_unref(e->file); } } g_free(s->extents); @@ -259,7 +262,7 @@ static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) p_name = strstr(desc, cid_str); if (p_name != NULL) { p_name += cid_str_size; - sscanf(p_name, "%x", &cid); + sscanf(p_name, "%" SCNx32, &cid); } return cid; @@ -287,7 +290,7 @@ static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) p_name = strstr(desc, "CID"); if (p_name != NULL) { p_name += sizeof("CID"); - snprintf(p_name, sizeof(desc) - (p_name - desc), "%x\n", cid); + snprintf(p_name, sizeof(desc) - (p_name - desc), "%" PRIx32 "\n", cid); pstrcat(desc, sizeof(desc), tmp_desc); } @@ -301,19 +304,18 @@ static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) static int vmdk_is_cid_valid(BlockDriverState *bs) { -#ifdef CHECK_CID BDRVVmdkState *s = bs->opaque; BlockDriverState *p_bs = bs->backing_hd; uint32_t cur_pcid; - if (p_bs) { + if (!s->cid_checked && p_bs) { cur_pcid = vmdk_read_cid(p_bs, 0); if (s->parent_cid != cur_pcid) { /* CID not valid */ return 0; } } -#endif + s->cid_checked = true; /* CID valid */ return 1; } @@ -331,8 +333,7 @@ static int vmdk_reopen_prepare(BDRVReopenState *state, assert(state->bs != NULL); if (queue == NULL) { - error_set(errp, ERROR_CLASS_GENERIC_ERROR, - "No reopen queue for VMDK extents"); + error_setg(errp, "No reopen queue for VMDK extents"); goto exit; } @@ -391,15 +392,24 @@ static int vmdk_add_extent(BlockDriverState *bs, int64_t l1_offset, int64_t l1_backup_offset, uint32_t l1_size, int l2_size, uint64_t cluster_sectors, - VmdkExtent **new_extent) + VmdkExtent **new_extent, + Error **errp) { VmdkExtent *extent; BDRVVmdkState *s = bs->opaque; if (cluster_sectors > 0x200000) { /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */ - error_report("invalid granularity, image may be corrupt"); - return -EINVAL; + error_setg(errp, "Invalid granularity, image may be corrupt"); + return -EFBIG; + } + if (l1_size > 512 * 1024 * 1024) { + /* Although with big capacity and small l1_entry_sectors, we can get a + * big l1_size, we don't want unbounded value to allocate the table. + * Limit it to 512M, which is 16PB for default cluster and L2 table + * size */ + error_setg(errp, "L1 size too big"); + return -EFBIG; } s->extents = g_realloc(s->extents, @@ -416,7 +426,7 @@ static int vmdk_add_extent(BlockDriverState *bs, extent->l1_size = l1_size; extent->l1_entry_sectors = l2_size * cluster_sectors; extent->l2_size = l2_size; - extent->cluster_sectors = cluster_sectors; + extent->cluster_sectors = flat ? sectors : cluster_sectors; if (s->num_extents > 1) { extent->end_sector = (*(extent - 1)).end_sector + extent->sectors; @@ -430,7 +440,8 @@ static int vmdk_add_extent(BlockDriverState *bs, return 0; } -static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent) +static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, + Error **errp) { int ret; int l1_size, i; @@ -439,10 +450,13 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent) l1_size = extent->l1_size * sizeof(uint32_t); extent->l1_table = g_malloc(l1_size); ret = bdrv_pread(extent->file, - extent->l1_table_offset, - extent->l1_table, - l1_size); + extent->l1_table_offset, + extent->l1_table, + l1_size); if (ret < 0) { + error_setg_errno(errp, -ret, + "Could not read l1 table from extent '%s'", + extent->file->filename); goto fail_l1; } for (i = 0; i < extent->l1_size; i++) { @@ -452,10 +466,13 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent) if (extent->l1_backup_table_offset) { extent->l1_backup_table = g_malloc(l1_size); ret = bdrv_pread(extent->file, - extent->l1_backup_table_offset, - extent->l1_backup_table, - l1_size); + extent->l1_backup_table_offset, + extent->l1_backup_table, + l1_size); if (ret < 0) { + error_setg_errno(errp, -ret, + "Could not read l1 backup table from extent '%s'", + extent->file->filename); goto fail_l1b; } for (i = 0; i < extent->l1_size; i++) { @@ -473,9 +490,9 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent) return ret; } -static int vmdk_open_vmdk3(BlockDriverState *bs, - BlockDriverState *file, - int flags) +static int vmdk_open_vmfs_sparse(BlockDriverState *bs, + BlockDriverState *file, + int flags, Error **errp) { int ret; uint32_t magic; @@ -484,20 +501,24 @@ static int vmdk_open_vmdk3(BlockDriverState *bs, ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); if (ret < 0) { + error_setg_errno(errp, -ret, + "Could not read header from file '%s'", + file->filename); return ret; } - - ret = vmdk_add_extent(bs, - bs->file, false, - le32_to_cpu(header.disk_sectors), - le32_to_cpu(header.l1dir_offset) << 9, - 0, 1 << 6, 1 << 9, - le32_to_cpu(header.granularity), - &extent); + ret = vmdk_add_extent(bs, file, false, + le32_to_cpu(header.disk_sectors), + le32_to_cpu(header.l1dir_offset) << 9, + 0, + le32_to_cpu(header.l1dir_size), + 4096, + le32_to_cpu(header.granularity), + &extent, + errp); if (ret < 0) { return ret; } - ret = vmdk_init_tables(bs, extent); + ret = vmdk_init_tables(bs, extent, errp); if (ret) { /* free extent allocated by vmdk_add_extent */ vmdk_free_last_extent(bs); @@ -505,31 +526,71 @@ static int vmdk_open_vmdk3(BlockDriverState *bs, return ret; } -static int vmdk_open_desc_file(BlockDriverState *bs, int flags, - uint64_t desc_offset); +static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, + Error **errp); + +static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset, + Error **errp) +{ + int64_t size; + char *buf; + int ret; + + size = bdrv_getlength(file); + if (size < 0) { + error_setg_errno(errp, -size, "Could not access file"); + return NULL; + } + + size = MIN(size, 1 << 20); /* avoid unbounded allocation */ + buf = g_malloc0(size + 1); + + ret = bdrv_pread(file, desc_offset, buf, size); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read from file"); + g_free(buf); + return NULL; + } + + return buf; +} static int vmdk_open_vmdk4(BlockDriverState *bs, BlockDriverState *file, - int flags) + int flags, Error **errp) { int ret; uint32_t magic; uint32_t l1_size, l1_entry_sectors; VMDK4Header header; VmdkExtent *extent; + BDRVVmdkState *s = bs->opaque; int64_t l1_backup_offset = 0; ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); if (ret < 0) { - return ret; + error_setg_errno(errp, -ret, + "Could not read header from file '%s'", + file->filename); + return -EINVAL; } if (header.capacity == 0) { uint64_t desc_offset = le64_to_cpu(header.desc_offset); if (desc_offset) { - return vmdk_open_desc_file(bs, flags, desc_offset << 9); + char *buf = vmdk_read_desc(file, desc_offset << 9, errp); + if (!buf) { + return -EINVAL; + } + ret = vmdk_open_desc_file(bs, flags, buf, errp); + g_free(buf); + return ret; } } + if (!s->create_type) { + s->create_type = g_strdup("monolithicSparse"); + } + if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) { /* * The footer takes precedence over the header, so read it in. The @@ -577,17 +638,24 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, header = footer.header; } - if (le32_to_cpu(header.version) >= 3) { + if (le32_to_cpu(header.version) > 3) { char buf[64]; - snprintf(buf, sizeof(buf), "VMDK version %d", + snprintf(buf, sizeof(buf), "VMDK version %" PRId32, le32_to_cpu(header.version)); - qerror_report(QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bs->device_name, "vmdk", buf); + error_set(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bs->device_name, "vmdk", buf); return -ENOTSUP; + } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) { + /* VMware KB 2064959 explains that version 3 added support for + * persistent changed block tracking (CBT), and backup software can + * read it as version=1 if it doesn't care about the changed area + * information. So we are safe to enable read only. */ + error_setg(errp, "VMDK version 3 must be read only"); + return -EINVAL; } if (le32_to_cpu(header.num_gtes_per_gt) > 512) { - error_report("L2 table size too big"); + error_setg(errp, "L2 table size too big"); return -EINVAL; } @@ -598,17 +666,17 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, } l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1) / l1_entry_sectors; - if (l1_size > 512 * 1024 * 1024) { - /* although with big capacity and small l1_entry_sectors, we can get a - * big l1_size, we don't want unbounded value to allocate the table. - * Limit it to 512M, which is 16PB for default cluster and L2 table - * size */ - error_report("L1 size too big"); - return -EFBIG; - } if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) { l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9; } + if (bdrv_getlength(file) < + le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE) { + error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes", + (int64_t)(le64_to_cpu(header.grain_offset) + * BDRV_SECTOR_SIZE)); + return -EINVAL; + } + ret = vmdk_add_extent(bs, file, false, le64_to_cpu(header.capacity), le64_to_cpu(header.gd_offset) << 9, @@ -616,16 +684,21 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, l1_size, le32_to_cpu(header.num_gtes_per_gt), le64_to_cpu(header.granularity), - &extent); + &extent, + errp); if (ret < 0) { return ret; } extent->compressed = le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE; + if (extent->compressed) { + g_free(s->create_type); + s->create_type = g_strdup("streamOptimized"); + } extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER; extent->version = le32_to_cpu(header.version); extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN; - ret = vmdk_init_tables(bs, extent); + ret = vmdk_init_tables(bs, extent, errp); if (ret) { /* free extent allocated by vmdk_add_extent */ vmdk_free_last_extent(bs); @@ -662,31 +735,28 @@ static int vmdk_parse_description(const char *desc, const char *opt_name, /* Open an extent file and append to bs array */ static int vmdk_open_sparse(BlockDriverState *bs, - BlockDriverState *file, - int flags) + BlockDriverState *file, int flags, + char *buf, Error **errp) { uint32_t magic; - if (bdrv_pread(file, 0, &magic, sizeof(magic)) != sizeof(magic)) { - return -EIO; - } - - magic = be32_to_cpu(magic); + magic = ldl_be_p(buf); switch (magic) { case VMDK3_MAGIC: - return vmdk_open_vmdk3(bs, file, flags); + return vmdk_open_vmfs_sparse(bs, file, flags, errp); break; case VMDK4_MAGIC: - return vmdk_open_vmdk4(bs, file, flags); + return vmdk_open_vmdk4(bs, file, flags, errp); break; default: - return -EMEDIUMTYPE; + error_setg(errp, "Image not in VMDK format"); + return -EINVAL; break; } } static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, - const char *desc_file_path) + const char *desc_file_path, Error **errp) { int ret; char access[11]; @@ -697,6 +767,8 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, int64_t flat_offset; char extent_path[PATH_MAX]; BlockDriverState *extent_file; + BDRVVmdkState *s = bs->opaque; + VmdkExtent *extent; while (*p) { /* parse extent line: @@ -711,116 +783,141 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, goto next_line; } else if (!strcmp(type, "FLAT")) { if (ret != 5 || flat_offset < 0) { + error_setg(errp, "Invalid extent lines: \n%s", p); + return -EINVAL; + } + } else if (!strcmp(type, "VMFS")) { + if (ret == 4) { + flat_offset = 0; + } else { + error_setg(errp, "Invalid extent lines:\n%s", p); return -EINVAL; } } else if (ret != 4) { + error_setg(errp, "Invalid extent lines:\n%s", p); return -EINVAL; } if (sectors <= 0 || - (strcmp(type, "FLAT") && strcmp(type, "SPARSE")) || + (strcmp(type, "FLAT") && strcmp(type, "SPARSE") && + strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE")) || (strcmp(access, "RW"))) { goto next_line; } path_combine(extent_path, sizeof(extent_path), desc_file_path, fname); - ret = bdrv_file_open(&extent_file, extent_path, NULL, bs->open_flags); + extent_file = NULL; + ret = bdrv_open(&extent_file, extent_path, NULL, NULL, + bs->open_flags | BDRV_O_PROTOCOL, NULL, errp); if (ret) { return ret; } /* save to extents array */ - if (!strcmp(type, "FLAT")) { + if (!strcmp(type, "FLAT") || !strcmp(type, "VMFS")) { /* FLAT extent */ - VmdkExtent *extent; ret = vmdk_add_extent(bs, extent_file, true, sectors, - 0, 0, 0, 0, sectors, &extent); + 0, 0, 0, 0, 0, &extent, errp); if (ret < 0) { return ret; } extent->flat_start_offset = flat_offset << 9; - } else if (!strcmp(type, "SPARSE")) { - /* SPARSE extent */ - ret = vmdk_open_sparse(bs, extent_file, bs->open_flags); + } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) { + /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/ + char *buf = vmdk_read_desc(extent_file, 0, errp); + if (!buf) { + ret = -EINVAL; + } else { + ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf, errp); + } if (ret) { - bdrv_delete(extent_file); + g_free(buf); + bdrv_unref(extent_file); return ret; } + extent = &s->extents[s->num_extents - 1]; } else { - fprintf(stderr, - "VMDK: Not supported extent type \"%s\""".\n", type); + error_setg(errp, "Unsupported extent type '%s'", type); return -ENOTSUP; } + extent->type = g_strdup(type); next_line: /* move to next line */ - while (*p && *p != '\n') { + while (*p) { + if (*p == '\n') { + p++; + break; + } p++; } - p++; } return 0; } -static int vmdk_open_desc_file(BlockDriverState *bs, int flags, - uint64_t desc_offset) +static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, + Error **errp) { int ret; - char *buf = NULL; char ct[128]; BDRVVmdkState *s = bs->opaque; - int64_t size; - size = bdrv_getlength(bs->file); - if (size < 0) { - return -EINVAL; - } - - size = MIN(size, 1 << 20); /* avoid unbounded allocation */ - buf = g_malloc0(size + 1); - - ret = bdrv_pread(bs->file, desc_offset, buf, size); - if (ret < 0) { - goto exit; - } if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) { - ret = -EMEDIUMTYPE; + error_setg(errp, "invalid VMDK image descriptor"); + ret = -EINVAL; goto exit; } if (strcmp(ct, "monolithicFlat") && + strcmp(ct, "vmfs") && + strcmp(ct, "vmfsSparse") && strcmp(ct, "twoGbMaxExtentSparse") && strcmp(ct, "twoGbMaxExtentFlat")) { - fprintf(stderr, - "VMDK: Not supported image type \"%s\""".\n", ct); + error_setg(errp, "Unsupported image type '%s'", ct); ret = -ENOTSUP; goto exit; } + s->create_type = g_strdup(ct); s->desc_offset = 0; - ret = vmdk_parse_extents(buf, bs, bs->file->filename); + ret = vmdk_parse_extents(buf, bs, bs->file->filename, errp); exit: - g_free(buf); return ret; } -static int vmdk_open(BlockDriverState *bs, QDict *options, int flags) +static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { + char *buf = NULL; int ret; BDRVVmdkState *s = bs->opaque; + uint32_t magic; - if (vmdk_open_sparse(bs, bs->file, flags) == 0) { - s->desc_offset = 0x200; - } else { - ret = vmdk_open_desc_file(bs, flags, 0); - if (ret) { - goto fail; - } + buf = vmdk_read_desc(bs->file, 0, errp); + if (!buf) { + return -EINVAL; + } + + magic = ldl_be_p(buf); + switch (magic) { + case VMDK3_MAGIC: + case VMDK4_MAGIC: + ret = vmdk_open_sparse(bs, bs->file, flags, buf, errp); + s->desc_offset = 0x200; + break; + default: + ret = vmdk_open_desc_file(bs, flags, buf, errp); + break; + } + if (ret) { + goto fail; } + /* try to open parent images, if exist */ ret = vmdk_parent_open(bs); if (ret) { goto fail; } + s->cid = vmdk_read_cid(bs, 0); s->parent_cid = vmdk_read_cid(bs, 1); qemu_co_mutex_init(&s->lock); @@ -829,14 +926,34 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags) QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, "vmdk", bs->device_name, "live migration"); migrate_add_blocker(s->migration_blocker); - + g_free(buf); return 0; fail: + g_free(buf); + g_free(s->create_type); + s->create_type = NULL; vmdk_free_extents(bs); return ret; } + +static int vmdk_refresh_limits(BlockDriverState *bs) +{ + BDRVVmdkState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_extents; i++) { + if (!s->extents[i].flat) { + bs->bl.write_zeroes_alignment = + MAX(bs->bl.write_zeroes_alignment, + s->extents[i].cluster_sectors); + } + } + + return 0; +} + static int get_whole_cluster(BlockDriverState *bs, VmdkExtent *extent, uint64_t cluster_offset, @@ -1039,7 +1156,7 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { BDRVVmdkState *s = bs->opaque; @@ -1056,7 +1173,24 @@ static int coroutine_fn vmdk_co_is_allocated(BlockDriverState *bs, sector_num * 512, 0, &offset); qemu_co_mutex_unlock(&s->lock); - ret = (ret == VMDK_OK || ret == VMDK_ZEROED); + switch (ret) { + case VMDK_ERROR: + ret = -EIO; + break; + case VMDK_UNALLOC: + ret = 0; + break; + case VMDK_ZEROED: + ret = BDRV_BLOCK_ZERO; + break; + case VMDK_OK: + ret = BDRV_BLOCK_DATA; + if (extent->file == bs->file && !extent->compressed) { + ret |= BDRV_BLOCK_OFFSET_VALID | offset; + } + + break; + } index_in_cluster = sector_num % extent->cluster_sectors; n = extent->cluster_sectors - index_in_cluster; @@ -1254,15 +1388,14 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, { BDRVVmdkState *s = bs->opaque; VmdkExtent *extent = NULL; - int n, ret; - int64_t index_in_cluster; + int ret; + int64_t index_in_cluster, n; uint64_t extent_begin_sector, extent_relative_sector_num; uint64_t cluster_offset; VmdkMetaData m_data; if (sector_num > bs->total_sectors) { - fprintf(stderr, - "(VMDK) Wrong offset: sector_num=0x%" PRIx64 + error_report("Wrong offset: sector_num=0x%" PRIx64 " total_sectors=0x%" PRIx64 "\n", sector_num, bs->total_sectors); return -EIO; @@ -1282,9 +1415,8 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ - fprintf(stderr, - "VMDK: can't write to allocated cluster" - " for streamOptimized\n"); + error_report("Could not write to allocated cluster" + " for streamOptimized"); return -EIO; } else { /* allocate */ @@ -1364,9 +1496,23 @@ static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num, return ret; } +static int vmdk_write_compressed(BlockDriverState *bs, + int64_t sector_num, + const uint8_t *buf, + int nb_sectors) +{ + BDRVVmdkState *s = bs->opaque; + if (s->num_extents == 1 && s->extents[0].compressed) { + return vmdk_write(bs, sector_num, buf, nb_sectors, false, false); + } else { + return -ENOTSUP; + } +} + static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) + int nb_sectors, + BdrvRequestFlags flags) { int ret; BDRVVmdkState *s = bs->opaque; @@ -1381,25 +1527,36 @@ static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, return ret; } - static int vmdk_create_extent(const char *filename, int64_t filesize, - bool flat, bool compress, bool zeroed_grain) + bool flat, bool compress, bool zeroed_grain, + Error **errp) { int ret, i; - int fd = 0; + BlockDriverState *bs = NULL; VMDK4Header header; - uint32_t tmp, magic, grains, gd_size, gt_size, gt_count; + Error *local_err; + uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count; + uint32_t *gd_buf = NULL; + int gd_buf_size; - fd = qemu_open(filename, - O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, - 0644); - if (fd < 0) { - return -errno; + ret = bdrv_create_file(filename, NULL, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; } + + assert(bs == NULL); + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + NULL, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; + } + if (flat) { - ret = ftruncate(fd, filesize); + ret = bdrv_truncate(bs, filesize); if (ret < 0) { - ret = -errno; + error_setg_errno(errp, -ret, "Could not truncate file"); } goto exit; } @@ -1410,24 +1567,23 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0) | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0); header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0; - header.capacity = filesize / 512; + header.capacity = filesize / BDRV_SECTOR_SIZE; header.granularity = 128; - header.num_gtes_per_gt = 512; + header.num_gtes_per_gt = BDRV_SECTOR_SIZE; - grains = (filesize / 512 + header.granularity - 1) / header.granularity; - gt_size = ((header.num_gtes_per_gt * sizeof(uint32_t)) + 511) >> 9; - gt_count = - (grains + header.num_gtes_per_gt - 1) / header.num_gtes_per_gt; - gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9; + grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity); + gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t), + BDRV_SECTOR_SIZE); + gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt); + gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE); header.desc_offset = 1; header.desc_size = 20; header.rgd_offset = header.desc_offset + header.desc_size; - header.gd_offset = header.rgd_offset + gd_size + (gt_size * gt_count); + header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count); header.grain_offset = - ((header.gd_offset + gd_size + (gt_size * gt_count) + - header.granularity - 1) / header.granularity) * - header.granularity; + ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count), + header.granularity); /* swap endianness for all header fields */ header.version = cpu_to_le32(header.version); header.flags = cpu_to_le32(header.flags); @@ -1447,58 +1603,65 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, header.check_bytes[3] = 0xa; /* write all the data */ - ret = qemu_write_full(fd, &magic, sizeof(magic)); - if (ret != sizeof(magic)) { - ret = -errno; + ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic)); + if (ret < 0) { + error_set(errp, QERR_IO_ERROR); goto exit; } - ret = qemu_write_full(fd, &header, sizeof(header)); - if (ret != sizeof(header)) { - ret = -errno; + ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header)); + if (ret < 0) { + error_set(errp, QERR_IO_ERROR); goto exit; } - ret = ftruncate(fd, le64_to_cpu(header.grain_offset) << 9); + ret = bdrv_truncate(bs, le64_to_cpu(header.grain_offset) << 9); if (ret < 0) { - ret = -errno; + error_setg_errno(errp, -ret, "Could not truncate file"); goto exit; } /* write grain directory */ - lseek(fd, le64_to_cpu(header.rgd_offset) << 9, SEEK_SET); - for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_size; + gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE; + gd_buf = g_malloc0(gd_buf_size); + for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors; i < gt_count; i++, tmp += gt_size) { - ret = qemu_write_full(fd, &tmp, sizeof(tmp)); - if (ret != sizeof(tmp)) { - ret = -errno; - goto exit; - } + gd_buf[i] = cpu_to_le32(tmp); + } + ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, + gd_buf, gd_buf_size); + if (ret < 0) { + error_set(errp, QERR_IO_ERROR); + goto exit; } /* write backup grain directory */ - lseek(fd, le64_to_cpu(header.gd_offset) << 9, SEEK_SET); - for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_size; + for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors; i < gt_count; i++, tmp += gt_size) { - ret = qemu_write_full(fd, &tmp, sizeof(tmp)); - if (ret != sizeof(tmp)) { - ret = -errno; - goto exit; - } + gd_buf[i] = cpu_to_le32(tmp); + } + ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, + gd_buf, gd_buf_size); + if (ret < 0) { + error_set(errp, QERR_IO_ERROR); + goto exit; } ret = 0; - exit: - qemu_close(fd); +exit: + if (bs) { + bdrv_unref(bs); + } + g_free(gd_buf); return ret; } static int filename_decompose(const char *filename, char *path, char *prefix, - char *postfix, size_t buf_len) + char *postfix, size_t buf_len, Error **errp) { const char *p, *q; if (filename == NULL || !strlen(filename)) { - fprintf(stderr, "Vmdk: no filename provided.\n"); + error_setg(errp, "No filename provided"); return VMDK_ERROR; } p = strrchr(filename, '/'); @@ -1532,10 +1695,13 @@ static int filename_decompose(const char *filename, char *path, char *prefix, return VMDK_OK; } -static int vmdk_create(const char *filename, QEMUOptionParameter *options) +static int vmdk_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { - int fd, idx = 0; - char desc[BUF_SIZE]; + int idx = 0; + BlockDriverState *new_bs = NULL; + Error *local_err; + char *desc = NULL; int64_t total_size = 0, filesize; const char *adapter_type = NULL; const char *backing_file = NULL; @@ -1543,7 +1709,7 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) int flags = 0; int ret = 0; bool flat, split, compress; - char ext_desc_lines[BUF_SIZE] = ""; + GString *ext_desc_lines; char path[PATH_MAX], prefix[PATH_MAX], postfix[PATH_MAX]; const int64_t split_size = 0x80000000; /* VMDK has constant split size */ const char *desc_extent_line; @@ -1551,11 +1717,12 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) uint32_t parent_cid = 0xffffffff; uint32_t number_heads = 16; bool zeroed_grain = false; + uint32_t desc_offset = 0, desc_len; const char desc_template[] = "# Disk DescriptorFile\n" "version=1\n" - "CID=%x\n" - "parentCID=%x\n" + "CID=%" PRIx32 "\n" + "parentCID=%" PRIx32 "\n" "createType=\"%s\"\n" "%s" "\n" @@ -1567,12 +1734,15 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) "\n" "ddb.virtualHWVersion = \"%d\"\n" "ddb.geometry.cylinders = \"%" PRId64 "\"\n" - "ddb.geometry.heads = \"%d\"\n" + "ddb.geometry.heads = \"%" PRIu32 "\"\n" "ddb.geometry.sectors = \"63\"\n" "ddb.adapterType = \"%s\"\n"; - if (filename_decompose(filename, path, prefix, postfix, PATH_MAX)) { - return -EINVAL; + ext_desc_lines = g_string_new(NULL); + + if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) { + ret = -EINVAL; + goto exit; } /* Read out options */ while (options && options->name) { @@ -1597,8 +1767,9 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) strcmp(adapter_type, "buslogic") && strcmp(adapter_type, "lsilogic") && strcmp(adapter_type, "legacyESX")) { - fprintf(stderr, "VMDK: Unknown adapter type: '%s'.\n", adapter_type); - return -EINVAL; + error_setg(errp, "Unknown adapter type: '%s'", adapter_type); + ret = -EINVAL; + goto exit; } if (strcmp(adapter_type, "ide") != 0) { /* that's the number of heads with which vmware operates when @@ -1613,8 +1784,9 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) strcmp(fmt, "twoGbMaxExtentSparse") && strcmp(fmt, "twoGbMaxExtentFlat") && strcmp(fmt, "streamOptimized")) { - fprintf(stderr, "VMDK: Unknown subformat: %s\n", fmt); - return -EINVAL; + error_setg(errp, "Unknown subformat: '%s'", fmt); + ret = -EINVAL; + goto exit; } split = !(strcmp(fmt, "twoGbMaxExtentFlat") && strcmp(fmt, "twoGbMaxExtentSparse")); @@ -1622,27 +1794,34 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) strcmp(fmt, "twoGbMaxExtentFlat")); compress = !strcmp(fmt, "streamOptimized"); if (flat) { - desc_extent_line = "RW %lld FLAT \"%s\" 0\n"; + desc_extent_line = "RW %" PRId64 " FLAT \"%s\" 0\n"; } else { - desc_extent_line = "RW %lld SPARSE \"%s\"\n"; + desc_extent_line = "RW %" PRId64 " SPARSE \"%s\"\n"; } if (flat && backing_file) { - /* not supporting backing file for flat image */ - return -ENOTSUP; + error_setg(errp, "Flat image can't have backing file"); + ret = -ENOTSUP; + goto exit; + } + if (flat && zeroed_grain) { + error_setg(errp, "Flat image can't enable zeroed grain"); + ret = -ENOTSUP; + goto exit; } if (backing_file) { - BlockDriverState *bs = bdrv_new(""); - ret = bdrv_open(bs, backing_file, NULL, 0, NULL); + BlockDriverState *bs = NULL; + ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_NO_BACKING, NULL, + errp); if (ret != 0) { - bdrv_delete(bs); - return ret; + goto exit; } if (strcmp(bs->drv->format_name, "vmdk")) { - bdrv_delete(bs); - return -EINVAL; + bdrv_unref(bs); + ret = -EINVAL; + goto exit; } parent_cid = vmdk_read_cid(bs, 0); - bdrv_delete(bs); + bdrv_unref(bs); snprintf(parent_desc_line, sizeof(parent_desc_line), "parentFileNameHint=\"%s\"", backing_file); } @@ -1672,51 +1851,66 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options) path, desc_filename); if (vmdk_create_extent(ext_filename, size, - flat, compress, zeroed_grain)) { - return -EINVAL; + flat, compress, zeroed_grain, errp)) { + ret = -EINVAL; + goto exit; } filesize -= size; /* Format description line */ snprintf(desc_line, sizeof(desc_line), - desc_extent_line, size / 512, desc_filename); - pstrcat(ext_desc_lines, sizeof(ext_desc_lines), desc_line); + desc_extent_line, size / BDRV_SECTOR_SIZE, desc_filename); + g_string_append(ext_desc_lines, desc_line); } /* generate descriptor file */ - snprintf(desc, sizeof(desc), desc_template, - (unsigned int)time(NULL), - parent_cid, - fmt, - parent_desc_line, - ext_desc_lines, - (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4), - total_size / (int64_t)(63 * number_heads * 512), number_heads, - adapter_type); - if (split || flat) { - fd = qemu_open(filename, - O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, - 0644); + desc = g_strdup_printf(desc_template, + (uint32_t)time(NULL), + parent_cid, + fmt, + parent_desc_line, + ext_desc_lines->str, + (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4), + total_size / + (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE), + number_heads, + adapter_type); + desc_len = strlen(desc); + /* the descriptor offset = 0x200 */ + if (!split && !flat) { + desc_offset = 0x200; } else { - fd = qemu_open(filename, - O_WRONLY | O_BINARY | O_LARGEFILE, - 0644); - } - if (fd < 0) { - return -errno; + ret = bdrv_create_file(filename, options, &local_err); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not create image file"); + goto exit; + } } - /* the descriptor offset = 0x200 */ - if (!split && !flat && 0x200 != lseek(fd, 0x200, SEEK_SET)) { - ret = -errno; + assert(new_bs == NULL); + ret = bdrv_open(&new_bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write description"); goto exit; } - ret = qemu_write_full(fd, desc, strlen(desc)); - if (ret != strlen(desc)) { - ret = -errno; + ret = bdrv_pwrite(new_bs, desc_offset, desc, desc_len); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write description"); goto exit; } - ret = 0; + /* bdrv_pwrite write padding zeros to align to sector, we don't need that + * for description file */ + if (desc_offset == 0) { + ret = bdrv_truncate(new_bs, desc_len); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not truncate file"); + } + } exit: - qemu_close(fd); + if (new_bs) { + bdrv_unref(new_bs); + } + g_free(desc); + g_string_free(ext_desc_lines, true); return ret; } @@ -1725,6 +1919,7 @@ static void vmdk_close(BlockDriverState *bs) BDRVVmdkState *s = bs->opaque; vmdk_free_extents(bs); + g_free(s->create_type); migrate_del_blocker(s->migration_blocker); error_free(s->migration_blocker); @@ -1786,6 +1981,121 @@ static int vmdk_has_zero_init(BlockDriverState *bs) return 1; } +static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent) +{ + ImageInfo *info = g_new0(ImageInfo, 1); + + *info = (ImageInfo){ + .filename = g_strdup(extent->file->filename), + .format = g_strdup(extent->type), + .virtual_size = extent->sectors * BDRV_SECTOR_SIZE, + .compressed = extent->compressed, + .has_compressed = extent->compressed, + .cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE, + .has_cluster_size = !extent->flat, + }; + + return info; +} + +static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) +{ + BDRVVmdkState *s = bs->opaque; + VmdkExtent *extent = NULL; + int64_t sector_num = 0; + int64_t total_sectors = bdrv_getlength(bs) / BDRV_SECTOR_SIZE; + int ret; + uint64_t cluster_offset; + + if (fix) { + return -ENOTSUP; + } + + for (;;) { + if (sector_num >= total_sectors) { + return 0; + } + extent = find_extent(s, sector_num, extent); + if (!extent) { + fprintf(stderr, + "ERROR: could not find extent for sector %" PRId64 "\n", + sector_num); + break; + } + ret = get_cluster_offset(bs, extent, NULL, + sector_num << BDRV_SECTOR_BITS, + 0, &cluster_offset); + if (ret == VMDK_ERROR) { + fprintf(stderr, + "ERROR: could not get cluster_offset for sector %" + PRId64 "\n", sector_num); + break; + } + if (ret == VMDK_OK && cluster_offset >= bdrv_getlength(extent->file)) { + fprintf(stderr, + "ERROR: cluster offset for sector %" + PRId64 " points after EOF\n", sector_num); + break; + } + sector_num += extent->cluster_sectors; + } + + result->corruptions++; + return 0; +} + +static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs) +{ + int i; + BDRVVmdkState *s = bs->opaque; + ImageInfoSpecific *spec_info = g_new0(ImageInfoSpecific, 1); + ImageInfoList **next; + + *spec_info = (ImageInfoSpecific){ + .kind = IMAGE_INFO_SPECIFIC_KIND_VMDK, + { + .vmdk = g_new0(ImageInfoSpecificVmdk, 1), + }, + }; + + *spec_info->vmdk = (ImageInfoSpecificVmdk) { + .create_type = g_strdup(s->create_type), + .cid = s->cid, + .parent_cid = s->parent_cid, + }; + + next = &spec_info->vmdk->extents; + for (i = 0; i < s->num_extents; i++) { + *next = g_new0(ImageInfoList, 1); + (*next)->value = vmdk_get_extent_info(&s->extents[i]); + (*next)->next = NULL; + next = &(*next)->next; + } + + return spec_info; +} + +static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + int i; + BDRVVmdkState *s = bs->opaque; + assert(s->num_extents); + bdi->needs_compressed_writes = s->extents[0].compressed; + if (!s->extents[0].flat) { + bdi->cluster_size = s->extents[0].cluster_sectors << BDRV_SECTOR_BITS; + } + /* See if we have multiple extents but they have different cases */ + for (i = 1; i < s->num_extents; i++) { + if (bdi->needs_compressed_writes != s->extents[i].compressed || + (bdi->cluster_size && bdi->cluster_size != + s->extents[i].cluster_sectors << BDRV_SECTOR_BITS)) { + return -ENOTSUP; + } + } + return 0; +} + static QEMUOptionParameter vmdk_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -1828,16 +2138,21 @@ static BlockDriver bdrv_vmdk = { .instance_size = sizeof(BDRVVmdkState), .bdrv_probe = vmdk_probe, .bdrv_open = vmdk_open, + .bdrv_check = vmdk_check, .bdrv_reopen_prepare = vmdk_reopen_prepare, .bdrv_read = vmdk_co_read, .bdrv_write = vmdk_co_write, + .bdrv_write_compressed = vmdk_write_compressed, .bdrv_co_write_zeroes = vmdk_co_write_zeroes, .bdrv_close = vmdk_close, .bdrv_create = vmdk_create, .bdrv_co_flush_to_disk = vmdk_co_flush, - .bdrv_co_is_allocated = vmdk_co_is_allocated, + .bdrv_co_get_block_status = vmdk_co_get_block_status, .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, .bdrv_has_zero_init = vmdk_has_zero_init, + .bdrv_get_specific_info = vmdk_get_specific_info, + .bdrv_refresh_limits = vmdk_refresh_limits, + .bdrv_get_info = vmdk_get_info, .create_options = vmdk_create_options, }; diff --git a/block/vpc.c b/block/vpc.c index fe4f311d50..2e25f57230 100644 --- a/block/vpc.c +++ b/block/vpc.c @@ -45,8 +45,10 @@ enum vhd_type { // Seconds since Jan 1, 2000 0:00:00 (UTC) #define VHD_TIMESTAMP_BASE 946684800 +#define VHD_MAX_SECTORS (65535LL * 255 * 255) + // always big-endian -struct vhd_footer { +typedef struct vhd_footer { char creator[8]; // "conectix" uint32_t features; uint32_t version; @@ -79,9 +81,9 @@ struct vhd_footer { uint8_t uuid[16]; uint8_t in_saved_state; -}; +} QEMU_PACKED VHDFooter; -struct vhd_dyndisk_header { +typedef struct vhd_dyndisk_header { char magic[8]; // "cxsparse" // Offset of next header structure, 0xFFFFFFFF if none @@ -111,7 +113,7 @@ struct vhd_dyndisk_header { uint32_t reserved; uint64_t data_offset; } parent_locator[8]; -}; +} QEMU_PACKED VHDDynDiskHeader; typedef struct BDRVVPCState { CoMutex lock; @@ -155,14 +157,16 @@ static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } -static int vpc_open(BlockDriverState *bs, QDict *options, int flags) +static int vpc_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVVPCState *s = bs->opaque; int i; - struct vhd_footer* footer; - struct vhd_dyndisk_header* dyndisk_header; + VHDFooter *footer; + VHDDynDiskHeader *dyndisk_header; uint8_t buf[HEADER_SIZE]; uint32_t checksum; + uint64_t computed_size; int disk_type = VHD_DYNAMIC; int ret; @@ -171,7 +175,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } - footer = (struct vhd_footer*) s->footer_buf; + footer = (VHDFooter *) s->footer_buf; if (strncmp(footer->creator, "conectix", 8)) { int64_t offset = bdrv_getlength(bs->file); if (offset < 0) { @@ -189,7 +193,8 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } if (strncmp(footer->creator, "conectix", 8)) { - ret = -EMEDIUMTYPE; + error_setg(errp, "invalid VPC image"); + ret = -EINVAL; goto fail; } disk_type = VHD_FIXED; @@ -210,8 +215,17 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) bs->total_sectors = (int64_t) be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl; + /* images created with disk2vhd report a far higher virtual size + * than expected with the cyls * heads * sectors_per_cyl formula. + * use the footer->size instead if the image was created with + * disk2vhd. + */ + if (!strncmp(footer->creator_app, "d2v", 4)) { + bs->total_sectors = be64_to_cpu(footer->size) / BDRV_SECTOR_SIZE; + } + /* Allow a maximum disk size of approximately 2 TB */ - if (bs->total_sectors >= 65535LL * 255 * 255) { + if (bs->total_sectors >= VHD_MAX_SECTORS) { ret = -EFBIG; goto fail; } @@ -223,7 +237,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) goto fail; } - dyndisk_header = (struct vhd_dyndisk_header *) buf; + dyndisk_header = (VHDDynDiskHeader *) buf; if (strncmp(dyndisk_header->magic, "cxsparse", 8)) { ret = -EINVAL; @@ -231,10 +245,31 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) } s->block_size = be32_to_cpu(dyndisk_header->block_size); + if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) { + error_setg(errp, "Invalid block size %" PRIu32, s->block_size); + ret = -EINVAL; + goto fail; + } s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511; s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries); - s->pagetable = g_malloc(s->max_table_entries * 4); + + if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) { + ret = -EINVAL; + goto fail; + } + if (s->max_table_entries > (VHD_MAX_SECTORS * 512) / s->block_size) { + ret = -EINVAL; + goto fail; + } + + computed_size = (uint64_t) s->max_table_entries * s->block_size; + if (computed_size < bs->total_sectors * 512) { + ret = -EINVAL; + goto fail; + } + + s->pagetable = qemu_blockalign(bs, s->max_table_entries * 4); s->bat_offset = be64_to_cpu(dyndisk_header->table_offset); @@ -259,6 +294,13 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) } } + if (s->free_data_block_offset > bdrv_getlength(bs->file)) { + error_setg(errp, "block-vpc: free_data_block_offset points after " + "the end of file. The image has been truncated."); + ret = -EINVAL; + goto fail; + } + s->last_bitmap_offset = (int64_t) -1; #ifdef CACHE @@ -280,7 +322,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags) return 0; fail: - g_free(s->pagetable); + qemu_vfree(s->pagetable); #ifdef CACHE g_free(s->pageentry_u8); #endif @@ -438,6 +480,19 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) return -1; } +static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVVPCState *s = (BDRVVPCState *)bs->opaque; + VHDFooter *footer = (VHDFooter *) s->footer_buf; + + if (cpu_to_be32(footer->type) != VHD_FIXED) { + bdi->cluster_size = s->block_size; + } + + bdi->unallocated_blocks_are_zero = true; + return 0; +} + static int vpc_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors) { @@ -445,7 +500,7 @@ static int vpc_read(BlockDriverState *bs, int64_t sector_num, int ret; int64_t offset; int64_t sectors, sectors_per_block; - struct vhd_footer *footer = (struct vhd_footer *) s->footer_buf; + VHDFooter *footer = (VHDFooter *) s->footer_buf; if (cpu_to_be32(footer->type) == VHD_FIXED) { return bdrv_read(bs->file, sector_num, buf, nb_sectors); @@ -494,7 +549,7 @@ static int vpc_write(BlockDriverState *bs, int64_t sector_num, int64_t offset; int64_t sectors, sectors_per_block; int ret; - struct vhd_footer *footer = (struct vhd_footer *) s->footer_buf; + VHDFooter *footer = (VHDFooter *) s->footer_buf; if (cpu_to_be32(footer->type) == VHD_FIXED) { return bdrv_write(bs->file, sector_num, buf, nb_sectors); @@ -596,8 +651,8 @@ static int calculate_geometry(int64_t total_sectors, uint16_t* cyls, static int create_dynamic_disk(int fd, uint8_t *buf, int64_t total_sectors) { - struct vhd_dyndisk_header* dyndisk_header = - (struct vhd_dyndisk_header*) buf; + VHDDynDiskHeader *dyndisk_header = + (VHDDynDiskHeader *) buf; size_t block_size, num_bat_entries; int i; int ret = -EIO; @@ -683,10 +738,11 @@ static int create_fixed_disk(int fd, uint8_t *buf, int64_t total_size) return ret; } -static int vpc_create(const char *filename, QEMUOptionParameter *options) +static int vpc_create(const char *filename, QEMUOptionParameter *options, + Error **errp) { uint8_t buf[1024]; - struct vhd_footer *footer = (struct vhd_footer *) buf; + VHDFooter *footer = (VHDFooter *) buf; QEMUOptionParameter *disk_type_param; int fd, i; uint16_t cyls = 0; @@ -789,7 +845,7 @@ static int vpc_create(const char *filename, QEMUOptionParameter *options) static int vpc_has_zero_init(BlockDriverState *bs) { BDRVVPCState *s = bs->opaque; - struct vhd_footer *footer = (struct vhd_footer *) s->footer_buf; + VHDFooter *footer = (VHDFooter *) s->footer_buf; if (cpu_to_be32(footer->type) == VHD_FIXED) { return bdrv_has_zero_init(bs->file); @@ -801,7 +857,7 @@ static int vpc_has_zero_init(BlockDriverState *bs) static void vpc_close(BlockDriverState *bs) { BDRVVPCState *s = bs->opaque; - g_free(s->pagetable); + qemu_vfree(s->pagetable); #ifdef CACHE g_free(s->pageentry_u8); #endif @@ -839,6 +895,8 @@ static BlockDriver bdrv_vpc = { .bdrv_read = vpc_co_read, .bdrv_write = vpc_co_write, + .bdrv_get_info = vpc_get_info, + .create_options = vpc_create_options, .bdrv_has_zero_init = vpc_has_zero_init, }; diff --git a/block/vvfat.c b/block/vvfat.c index cd3b8edd9f..c3af7ff4c5 100644 --- a/block/vvfat.c +++ b/block/vvfat.c @@ -266,8 +266,7 @@ typedef struct mbr_t { } QEMU_PACKED mbr_t; typedef struct direntry_t { - uint8_t name[8]; - uint8_t extension[3]; + uint8_t name[8 + 3]; uint8_t attributes; uint8_t reserved[2]; uint16_t ctime; @@ -518,11 +517,9 @@ static inline uint8_t fat_chksum(const direntry_t* entry) uint8_t chksum=0; int i; - for(i=0;i<11;i++) { - unsigned char c; - - c = (i < 8) ? entry->name[i] : entry->extension[i-8]; - chksum=(((chksum&0xfe)>>1)|((chksum&0x01)?0x80:0)) + c; + for (i = 0; i < ARRAY_SIZE(entry->name); i++) { + chksum = (((chksum & 0xfe) >> 1) | + ((chksum & 0x01) ? 0x80 : 0)) + entry->name[i]; } return chksum; @@ -617,7 +614,7 @@ static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s, if(is_dot) { entry=array_get_next(&(s->directory)); - memset(entry->name,0x20,11); + memset(entry->name, 0x20, sizeof(entry->name)); memcpy(entry->name,filename,strlen(filename)); return entry; } @@ -632,12 +629,14 @@ static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s, i = 8; entry=array_get_next(&(s->directory)); - memset(entry->name,0x20,11); + memset(entry->name, 0x20, sizeof(entry->name)); memcpy(entry->name, filename, i); - if(j > 0) - for (i = 0; i < 3 && filename[j+1+i]; i++) - entry->extension[i] = filename[j+1+i]; + if (j > 0) { + for (i = 0; i < 3 && filename[j + 1 + i]; i++) { + entry->name[8 + i] = filename[j + 1 + i]; + } + } /* upcase & remove unwanted characters */ for(i=10;i>=0;i--) { @@ -861,8 +860,7 @@ static int init_directories(BDRVVVFATState* s, { direntry_t* entry=array_get_next(&(s->directory)); entry->attributes=0x28; /* archive | volume label */ - memcpy(entry->name,"QEMU VVF",8); - memcpy(entry->extension,"AT ",3); + memcpy(entry->name, "QEMU VVFAT ", sizeof(entry->name)); } /* Now build FAT, and write back information into directory */ @@ -1065,7 +1063,8 @@ static void vvfat_parse_filename(const char *filename, QDict *options, qdict_put(options, "rw", qbool_from_int(rw)); } -static int vvfat_open(BlockDriverState *bs, QDict *options, int flags) +static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { BDRVVVFATState *s = bs->opaque; int cyls, heads, secs; @@ -1084,19 +1083,17 @@ DLOG(if (stderr == NULL) { setbuf(stderr, NULL); }) - opts = qemu_opts_create_nofail(&runtime_opts); + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); qemu_opts_absorb_qdict(opts, options, &local_err); - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); + if (local_err) { + error_propagate(errp, local_err); ret = -EINVAL; goto fail; } dirname = qemu_opt_get(opts, "dir"); if (!dirname) { - qerror_report(ERROR_CLASS_GENERIC_ERROR, "vvfat block driver requires " - "a 'dir' option"); + error_setg(errp, "vvfat block driver requires a 'dir' option"); ret = -EINVAL; goto fail; } @@ -1122,6 +1119,7 @@ DLOG(if (stderr == NULL) { if (!s->fat_type) { s->fat_type = 16; } + s->first_sectors_number = 0x40; cyls = s->fat_type == 12 ? 64 : 1024; heads = 16; secs = 63; @@ -1136,8 +1134,7 @@ DLOG(if (stderr == NULL) { case 12: break; default: - qerror_report(ERROR_CLASS_GENERIC_ERROR, "Valid FAT types are only " - "12, 16 and 32"); + error_setg(errp, "Valid FAT types are only 12, 16 and 32"); ret = -EINVAL; goto fail; } @@ -1150,7 +1147,6 @@ DLOG(if (stderr == NULL) { s->current_cluster=0xffffffff; - s->first_sectors_number=0x40; /* read only is the default for safety */ bs->read_only = 1; s->qcow = s->write_target = NULL; @@ -1590,17 +1586,20 @@ static int parse_short_name(BDRVVVFATState* s, lfn->name[i] = direntry->name[i]; } - for (j = 2; j >= 0 && direntry->extension[j] == ' '; j--); + for (j = 2; j >= 0 && direntry->name[8 + j] == ' '; j--) { + } if (j >= 0) { lfn->name[i++] = '.'; lfn->name[i + j + 1] = '\0'; for (;j >= 0; j--) { - if (direntry->extension[j] <= ' ' || direntry->extension[j] > 0x7f) - return -2; - else if (s->downcase_short_names) - lfn->name[i + j] = qemu_tolower(direntry->extension[j]); - else - lfn->name[i + j] = direntry->extension[j]; + uint8_t c = direntry->name[8 + j]; + if (c <= ' ' || c > 0x7f) { + return -2; + } else if (s->downcase_short_names) { + lfn->name[i + j] = qemu_tolower(c); + } else { + lfn->name[i + j] = c; + } } } else lfn->name[i + j + 1] = '\0'; @@ -2874,16 +2873,17 @@ static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num, return ret; } -static int coroutine_fn vvfat_co_is_allocated(BlockDriverState *bs, +static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int* n) { BDRVVVFATState* s = bs->opaque; *n = s->sector_count - sector_num; - if (*n > nb_sectors) - *n = nb_sectors; - else if (*n < 0) - return 0; - return 1; + if (*n > nb_sectors) { + *n = nb_sectors; + } else if (*n < 0) { + return 0; + } + return BDRV_BLOCK_DATA; } static int write_target_commit(BlockDriverState *bs, int64_t sector_num, @@ -2894,7 +2894,7 @@ static int write_target_commit(BlockDriverState *bs, int64_t sector_num, static void write_target_close(BlockDriverState *bs) { BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); - bdrv_delete(s->qcow); + bdrv_unref(s->qcow); g_free(s->qcow_filename); } @@ -2908,6 +2908,7 @@ static int enable_write_target(BDRVVVFATState *s) { BlockDriver *bdrv_qcow; QEMUOptionParameter *options; + Error *local_err = NULL; int ret; int size = sector2cluster(s, s->sector_count); s->used_clusters = calloc(size, 1); @@ -2925,17 +2926,20 @@ static int enable_write_target(BDRVVVFATState *s) set_option_parameter_int(options, BLOCK_OPT_SIZE, s->sector_count * 512); set_option_parameter(options, BLOCK_OPT_BACKING_FILE, "fat:"); - ret = bdrv_create(bdrv_qcow, s->qcow_filename, options); + ret = bdrv_create(bdrv_qcow, s->qcow_filename, options, &local_err); if (ret < 0) { + qerror_report_err(local_err); + error_free(local_err); goto err; } - s->qcow = bdrv_new(""); - - ret = bdrv_open(s->qcow, s->qcow_filename, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow); + s->qcow = NULL; + ret = bdrv_open(&s->qcow, s->qcow_filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, bdrv_qcow, + &local_err); if (ret < 0) { - bdrv_delete(s->qcow); + qerror_report_err(local_err); + error_free(local_err); goto err; } @@ -2943,7 +2947,7 @@ static int enable_write_target(BDRVVVFATState *s) unlink(s->qcow_filename); #endif - s->bs->backing_hd = calloc(sizeof(BlockDriverState), 1); + s->bs->backing_hd = bdrv_new("", &error_abort); s->bs->backing_hd->drv = &vvfat_write_target; s->bs->backing_hd->opaque = g_malloc(sizeof(void*)); *(void**)s->bs->backing_hd->opaque = s; @@ -2984,7 +2988,7 @@ static BlockDriver bdrv_vvfat = { .bdrv_read = vvfat_co_read, .bdrv_write = vvfat_co_write, - .bdrv_co_is_allocated = vvfat_co_is_allocated, + .bdrv_co_get_block_status = vvfat_co_get_block_status, }; static void bdrv_vvfat_init(void) diff --git a/block/win32-aio.c b/block/win32-aio.c index fcb7c754da..5d1d199b61 100644 --- a/block/win32-aio.c +++ b/block/win32-aio.c @@ -105,13 +105,6 @@ static void win32_aio_completion_cb(EventNotifier *e) } } -static int win32_aio_flush_cb(EventNotifier *e) -{ - QEMUWin32AIOState *s = container_of(e, QEMUWin32AIOState, e); - - return (s->count > 0) ? 1 : 0; -} - static void win32_aio_cancel(BlockDriverAIOCB *blockacb) { QEMUWin32AIOCB *waiocb = (QEMUWin32AIOCB *)blockacb; @@ -201,8 +194,7 @@ QEMUWin32AIOState *win32_aio_init(void) goto out_close_efd; } - qemu_aio_set_event_notifier(&s->e, win32_aio_completion_cb, - win32_aio_flush_cb); + qemu_aio_set_event_notifier(&s->e, win32_aio_completion_cb); return s; diff --git a/blockdev-nbd.c b/blockdev-nbd.c index 95f10c81e3..922cf5657b 100644 --- a/blockdev-nbd.c +++ b/blockdev-nbd.c @@ -69,12 +69,6 @@ static void nbd_close_notifier(Notifier *n, void *data) g_free(cn); } -static void nbd_server_put_ref(NBDExport *exp) -{ - BlockDriverState *bs = nbd_export_get_blockdev(exp); - drive_put_ref(drive_get_by_blockdev(bs)); -} - void qmp_nbd_server_add(const char *device, bool has_writable, bool writable, Error **errp) { @@ -105,11 +99,9 @@ void qmp_nbd_server_add(const char *device, bool has_writable, bool writable, writable = false; } - exp = nbd_export_new(bs, 0, -1, writable ? 0 : NBD_FLAG_READ_ONLY, - nbd_server_put_ref); + exp = nbd_export_new(bs, 0, -1, writable ? 0 : NBD_FLAG_READ_ONLY, NULL); nbd_export_set_name(exp, device); - drive_get_ref(drive_get_by_blockdev(bs)); n = g_malloc0(sizeof(NBDCloseNotifier)); n->n.notify = nbd_close_notifier; diff --git a/blockdev.c b/blockdev.c index e174b7d02d..7810e9fb68 100644 --- a/blockdev.c +++ b/blockdev.c @@ -38,6 +38,8 @@ #include "qemu/option.h" #include "qemu/config-file.h" #include "qapi/qmp/types.h" +#include "qapi-visit.h" +#include "qapi/qmp-output-visitor.h" #include "sysemu/sysemu.h" #include "block/block_int.h" #include "qmp-commands.h" @@ -45,8 +47,6 @@ #include "sysemu/arch_init.h" static QTAILQ_HEAD(drivelist, DriveInfo) drives = QTAILQ_HEAD_INITIALIZER(drives); -extern QemuOptsList qemu_common_drive_opts; -extern QemuOptsList qemu_old_drive_opts; static const char *const if_name[IF_COUNT] = { [IF_NONE] = "none", @@ -90,6 +90,10 @@ void blockdev_mark_auto_del(BlockDriverState *bs) { DriveInfo *dinfo = drive_get_by_blockdev(bs); + if (dinfo && !dinfo->enable_auto_del) { + return; + } + if (bs->job) { block_job_cancel(bs->job); } @@ -212,8 +216,11 @@ static void bdrv_format_print(void *opaque, const char *name) static void drive_uninit(DriveInfo *dinfo) { - qemu_opts_del(dinfo->opts); - bdrv_delete(dinfo->bdrv); + if (dinfo->opts) { + qemu_opts_del(dinfo->opts); + } + + bdrv_unref(dinfo->bdrv); g_free(dinfo->id); QTAILQ_REMOVE(&drives, dinfo, next); g_free(dinfo->serial); @@ -235,36 +242,36 @@ void drive_get_ref(DriveInfo *dinfo) typedef struct { QEMUBH *bh; - DriveInfo *dinfo; -} DrivePutRefBH; + BlockDriverState *bs; +} BDRVPutRefBH; -static void drive_put_ref_bh(void *opaque) +static void bdrv_put_ref_bh(void *opaque) { - DrivePutRefBH *s = opaque; + BDRVPutRefBH *s = opaque; - drive_put_ref(s->dinfo); + bdrv_unref(s->bs); qemu_bh_delete(s->bh); g_free(s); } /* - * Release a drive reference in a BH + * Release a BDS reference in a BH * - * It is not possible to use drive_put_ref() from a callback function when the - * callers still need the drive. In such cases we schedule a BH to release the - * reference. + * It is not safe to use bdrv_unref() from a callback function when the callers + * still need the BlockDriverState. In such cases we schedule a BH to release + * the reference. */ -static void drive_put_ref_bh_schedule(DriveInfo *dinfo) +static void bdrv_put_ref_bh_schedule(BlockDriverState *bs) { - DrivePutRefBH *s; + BDRVPutRefBH *s; - s = g_new(DrivePutRefBH, 1); - s->bh = qemu_bh_new(drive_put_ref_bh, s); - s->dinfo = dinfo; + s = g_new(BDRVPutRefBH, 1); + s->bh = qemu_bh_new(bdrv_put_ref_bh, s); + s->bs = bs; qemu_bh_schedule(s->bh); } -static int parse_block_error_action(const char *buf, bool is_read) +static int parse_block_error_action(const char *buf, bool is_read, Error **errp) { if (!strcmp(buf, "ignore")) { return BLOCKDEV_ON_ERROR_IGNORE; @@ -275,93 +282,63 @@ static int parse_block_error_action(const char *buf, bool is_read) } else if (!strcmp(buf, "report")) { return BLOCKDEV_ON_ERROR_REPORT; } else { - error_report("'%s' invalid %s error action", - buf, is_read ? "read" : "write"); + error_setg(errp, "'%s' invalid %s error action", + buf, is_read ? "read" : "write"); return -1; } } -static bool do_check_io_limits(BlockIOLimit *io_limits, Error **errp) +static bool check_throttle_config(ThrottleConfig *cfg, Error **errp) { - bool bps_flag; - bool iops_flag; - - assert(io_limits); - - bps_flag = (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] != 0) - && ((io_limits->bps[BLOCK_IO_LIMIT_READ] != 0) - || (io_limits->bps[BLOCK_IO_LIMIT_WRITE] != 0)); - iops_flag = (io_limits->iops[BLOCK_IO_LIMIT_TOTAL] != 0) - && ((io_limits->iops[BLOCK_IO_LIMIT_READ] != 0) - || (io_limits->iops[BLOCK_IO_LIMIT_WRITE] != 0)); - if (bps_flag || iops_flag) { - error_setg(errp, "bps(iops) and bps_rd/bps_wr(iops_rd/iops_wr) " - "cannot be used at the same time"); + if (throttle_conflicting(cfg)) { + error_setg(errp, "bps/iops/max total values and read/write values" + " cannot be used at the same time"); return false; } - if (io_limits->bps[BLOCK_IO_LIMIT_TOTAL] < 0 || - io_limits->bps[BLOCK_IO_LIMIT_WRITE] < 0 || - io_limits->bps[BLOCK_IO_LIMIT_READ] < 0 || - io_limits->iops[BLOCK_IO_LIMIT_TOTAL] < 0 || - io_limits->iops[BLOCK_IO_LIMIT_WRITE] < 0 || - io_limits->iops[BLOCK_IO_LIMIT_READ] < 0) { - error_setg(errp, "bps and iops values must be 0 or greater"); + if (!throttle_is_valid(cfg)) { + error_setg(errp, "bps/iops/maxs values must be 0 or greater"); return false; } return true; } -static DriveInfo *blockdev_init(QemuOpts *all_opts, - BlockInterfaceType block_default_type) +typedef enum { MEDIA_DISK, MEDIA_CDROM } DriveMediaType; + +/* Takes the ownership of bs_opts */ +static DriveInfo *blockdev_init(const char *file, QDict *bs_opts, + Error **errp) { const char *buf; - const char *file = NULL; const char *serial; - const char *mediastr = ""; - BlockInterfaceType type; - enum { MEDIA_DISK, MEDIA_CDROM } media; - int bus_id, unit_id; - int cyls, heads, secs, translation; - int max_devs; - int index; int ro = 0; int bdrv_flags = 0; int on_read_error, on_write_error; - const char *devaddr; DriveInfo *dinfo; - BlockIOLimit io_limits; + ThrottleConfig cfg; int snapshot = 0; bool copy_on_read; int ret; Error *error = NULL; QemuOpts *opts; - QDict *bs_opts; const char *id; bool has_driver_specific_opts; BlockDriver *drv = NULL; - translation = BIOS_ATA_TRANSLATION_AUTO; - media = MEDIA_DISK; - - /* Check common options by copying from all_opts to opts, all other options - * are stored in bs_opts. */ - id = qemu_opts_id(all_opts); + /* Check common options by copying from bs_opts to opts, all other options + * stay in bs_opts for processing by bdrv_open(). */ + id = qdict_get_try_str(bs_opts, "id"); opts = qemu_opts_create(&qemu_common_drive_opts, id, 1, &error); - if (error_is_set(&error)) { - qerror_report_err(error); - error_free(error); + if (error) { + error_propagate(errp, error); return NULL; } - bs_opts = qdict_new(); - qemu_opts_to_qdict(all_opts, bs_opts); qemu_opts_absorb_qdict(opts, bs_opts, &error); - if (error_is_set(&error)) { - qerror_report_err(error); - error_free(error); - return NULL; + if (error) { + error_propagate(errp, error); + goto early_err; } if (id) { @@ -371,97 +348,26 @@ static DriveInfo *blockdev_init(QemuOpts *all_opts, has_driver_specific_opts = !!qdict_size(bs_opts); /* extract parameters */ - bus_id = qemu_opt_get_number(opts, "bus", 0); - unit_id = qemu_opt_get_number(opts, "unit", -1); - index = qemu_opt_get_number(opts, "index", -1); - - cyls = qemu_opt_get_number(opts, "cyls", 0); - heads = qemu_opt_get_number(opts, "heads", 0); - secs = qemu_opt_get_number(opts, "secs", 0); - snapshot = qemu_opt_get_bool(opts, "snapshot", 0); ro = qemu_opt_get_bool(opts, "read-only", 0); copy_on_read = qemu_opt_get_bool(opts, "copy-on-read", false); - file = qemu_opt_get(opts, "file"); serial = qemu_opt_get(opts, "serial"); - if ((buf = qemu_opt_get(opts, "if")) != NULL) { - for (type = 0; type < IF_COUNT && strcmp(buf, if_name[type]); type++) - ; - if (type == IF_COUNT) { - error_report("unsupported bus type '%s'", buf); - return NULL; - } - } else { - type = block_default_type; - } - - max_devs = if_max_devs[type]; - - if (cyls || heads || secs) { - if (cyls < 1) { - error_report("invalid physical cyls number"); - return NULL; - } - if (heads < 1) { - error_report("invalid physical heads number"); - return NULL; - } - if (secs < 1) { - error_report("invalid physical secs number"); - return NULL; - } - } - - if ((buf = qemu_opt_get(opts, "trans")) != NULL) { - if (!cyls) { - error_report("'%s' trans must be used with cyls, heads and secs", - buf); - return NULL; - } - if (!strcmp(buf, "none")) - translation = BIOS_ATA_TRANSLATION_NONE; - else if (!strcmp(buf, "lba")) - translation = BIOS_ATA_TRANSLATION_LBA; - else if (!strcmp(buf, "auto")) - translation = BIOS_ATA_TRANSLATION_AUTO; - else { - error_report("'%s' invalid translation type", buf); - return NULL; - } - } - - if ((buf = qemu_opt_get(opts, "media")) != NULL) { - if (!strcmp(buf, "disk")) { - media = MEDIA_DISK; - } else if (!strcmp(buf, "cdrom")) { - if (cyls || secs || heads) { - error_report("CHS can't be set with media=%s", buf); - return NULL; - } - media = MEDIA_CDROM; - } else { - error_report("'%s' invalid media", buf); - return NULL; - } - } - if ((buf = qemu_opt_get(opts, "discard")) != NULL) { if (bdrv_parse_discard_flags(buf, &bdrv_flags) != 0) { - error_report("invalid discard option"); - return NULL; + error_setg(errp, "invalid discard option"); + goto early_err; } } - bdrv_flags = 0; if (qemu_opt_get_bool(opts, "cache.writeback", true)) { bdrv_flags |= BDRV_O_CACHE_WB; } if (qemu_opt_get_bool(opts, "cache.direct", false)) { bdrv_flags |= BDRV_O_NOCACHE; } - if (qemu_opt_get_bool(opts, "cache.no-flush", true)) { + if (qemu_opt_get_bool(opts, "cache.no-flush", false)) { bdrv_flags |= BDRV_O_NO_FLUSH; } @@ -472,8 +378,8 @@ static DriveInfo *blockdev_init(QemuOpts *all_opts, } else if (!strcmp(buf, "threads")) { /* this is the default */ } else { - error_report("invalid aio option"); - return NULL; + error_setg(errp, "invalid aio option"); + goto early_err; } } #endif @@ -483,148 +389,79 @@ static DriveInfo *blockdev_init(QemuOpts *all_opts, error_printf("Supported formats:"); bdrv_iterate_format(bdrv_format_print, NULL); error_printf("\n"); - return NULL; + goto early_err; } - drv = bdrv_find_whitelisted_format(buf, ro); + drv = bdrv_find_format(buf); if (!drv) { - error_report("'%s' invalid format", buf); - return NULL; + error_setg(errp, "'%s' invalid format", buf); + goto early_err; } } /* disk I/O throttling */ - io_limits.bps[BLOCK_IO_LIMIT_TOTAL] = + memset(&cfg, 0, sizeof(cfg)); + cfg.buckets[THROTTLE_BPS_TOTAL].avg = qemu_opt_get_number(opts, "throttling.bps-total", 0); - io_limits.bps[BLOCK_IO_LIMIT_READ] = + cfg.buckets[THROTTLE_BPS_READ].avg = qemu_opt_get_number(opts, "throttling.bps-read", 0); - io_limits.bps[BLOCK_IO_LIMIT_WRITE] = + cfg.buckets[THROTTLE_BPS_WRITE].avg = qemu_opt_get_number(opts, "throttling.bps-write", 0); - io_limits.iops[BLOCK_IO_LIMIT_TOTAL] = + cfg.buckets[THROTTLE_OPS_TOTAL].avg = qemu_opt_get_number(opts, "throttling.iops-total", 0); - io_limits.iops[BLOCK_IO_LIMIT_READ] = + cfg.buckets[THROTTLE_OPS_READ].avg = qemu_opt_get_number(opts, "throttling.iops-read", 0); - io_limits.iops[BLOCK_IO_LIMIT_WRITE] = + cfg.buckets[THROTTLE_OPS_WRITE].avg = qemu_opt_get_number(opts, "throttling.iops-write", 0); - if (!do_check_io_limits(&io_limits, &error)) { - error_report("%s", error_get_pretty(error)); - error_free(error); - return NULL; - } + cfg.buckets[THROTTLE_BPS_TOTAL].max = + qemu_opt_get_number(opts, "throttling.bps-total-max", 0); + cfg.buckets[THROTTLE_BPS_READ].max = + qemu_opt_get_number(opts, "throttling.bps-read-max", 0); + cfg.buckets[THROTTLE_BPS_WRITE].max = + qemu_opt_get_number(opts, "throttling.bps-write-max", 0); + cfg.buckets[THROTTLE_OPS_TOTAL].max = + qemu_opt_get_number(opts, "throttling.iops-total-max", 0); + cfg.buckets[THROTTLE_OPS_READ].max = + qemu_opt_get_number(opts, "throttling.iops-read-max", 0); + cfg.buckets[THROTTLE_OPS_WRITE].max = + qemu_opt_get_number(opts, "throttling.iops-write-max", 0); - if (qemu_opt_get(opts, "boot") != NULL) { - fprintf(stderr, "qemu-kvm: boot=on|off is deprecated and will be " - "ignored. Future versions will reject this parameter. Please " - "update your scripts.\n"); + cfg.op_size = qemu_opt_get_number(opts, "throttling.iops-size", 0); + + if (!check_throttle_config(&cfg, &error)) { + error_propagate(errp, error); + goto early_err; } on_write_error = BLOCKDEV_ON_ERROR_ENOSPC; if ((buf = qemu_opt_get(opts, "werror")) != NULL) { - if (type != IF_IDE && type != IF_SCSI && type != IF_VIRTIO && type != IF_NONE) { - error_report("werror is not supported by this bus type"); - return NULL; - } - - on_write_error = parse_block_error_action(buf, 0); - if (on_write_error < 0) { - return NULL; + on_write_error = parse_block_error_action(buf, 0, &error); + if (error) { + error_propagate(errp, error); + goto early_err; } } on_read_error = BLOCKDEV_ON_ERROR_REPORT; if ((buf = qemu_opt_get(opts, "rerror")) != NULL) { - if (type != IF_IDE && type != IF_VIRTIO && type != IF_SCSI && type != IF_NONE) { - error_report("rerror is not supported by this bus type"); - return NULL; - } - - on_read_error = parse_block_error_action(buf, 1); - if (on_read_error < 0) { - return NULL; - } - } - - if ((devaddr = qemu_opt_get(opts, "addr")) != NULL) { - if (type != IF_VIRTIO) { - error_report("addr is not supported by this bus type"); - return NULL; - } - } - - /* compute bus and unit according index */ - - if (index != -1) { - if (bus_id != 0 || unit_id != -1) { - error_report("index cannot be used with bus and unit"); - return NULL; + on_read_error = parse_block_error_action(buf, 1, &error); + if (error) { + error_propagate(errp, error); + goto early_err; } - bus_id = drive_index_to_bus_id(type, index); - unit_id = drive_index_to_unit_id(type, index); - } - - /* if user doesn't specify a unit_id, - * try to find the first free - */ - - if (unit_id == -1) { - unit_id = 0; - while (drive_get(type, bus_id, unit_id) != NULL) { - unit_id++; - if (max_devs && unit_id >= max_devs) { - unit_id -= max_devs; - bus_id++; - } - } - } - - /* check unit id */ - - if (max_devs && unit_id >= max_devs) { - error_report("unit %d too big (max is %d)", - unit_id, max_devs - 1); - return NULL; - } - - /* - * catch multiple definitions - */ - - if (drive_get(type, bus_id, unit_id) != NULL) { - error_report("drive with bus=%d, unit=%d (index=%d) exists", - bus_id, unit_id, index); - return NULL; } /* init */ - dinfo = g_malloc0(sizeof(*dinfo)); - if ((buf = qemu_opts_id(opts)) != NULL) { - dinfo->id = g_strdup(buf); - } else { - /* no id supplied -> create one */ - dinfo->id = g_malloc0(32); - if (type == IF_IDE || type == IF_SCSI) - mediastr = (media == MEDIA_CDROM) ? "-cd" : "-hd"; - if (max_devs) - snprintf(dinfo->id, 32, "%s%i%s%i", - if_name[type], bus_id, mediastr, unit_id); - else - snprintf(dinfo->id, 32, "%s%s%i", - if_name[type], mediastr, unit_id); - } - dinfo->bdrv = bdrv_new(dinfo->id); + dinfo->id = g_strdup(qemu_opts_id(opts)); + dinfo->bdrv = bdrv_new(dinfo->id, &error); + if (error) { + error_propagate(errp, error); + goto bdrv_new_err; + } dinfo->bdrv->open_flags = snapshot ? BDRV_O_SNAPSHOT : 0; dinfo->bdrv->read_only = ro; - dinfo->devaddr = devaddr; - dinfo->type = type; - dinfo->bus = bus_id; - dinfo->unit = unit_id; - dinfo->cyls = cyls; - dinfo->heads = heads; - dinfo->secs = secs; - dinfo->trans = translation; - dinfo->opts = all_opts; dinfo->refcount = 1; if (serial != NULL) { dinfo->serial = g_strdup(serial); @@ -634,42 +471,17 @@ static DriveInfo *blockdev_init(QemuOpts *all_opts, bdrv_set_on_error(dinfo->bdrv, on_read_error, on_write_error); /* disk I/O throttling */ - bdrv_set_io_limits(dinfo->bdrv, &io_limits); - - switch(type) { - case IF_IDE: - case IF_SCSI: - case IF_XEN: - case IF_NONE: - dinfo->media_cd = media == MEDIA_CDROM; - break; - case IF_SD: - case IF_FLOPPY: - case IF_PFLASH: - case IF_MTD: - break; - case IF_VIRTIO: - { - /* add virtio block device */ - QemuOpts *devopts; - devopts = qemu_opts_create_nofail(qemu_find_opts("device")); - if (arch_type == QEMU_ARCH_S390X) { - qemu_opt_set(devopts, "driver", "virtio-blk-s390"); - } else { - qemu_opt_set(devopts, "driver", "virtio-blk-pci"); - } - qemu_opt_set(devopts, "drive", dinfo->id); - if (devaddr) - qemu_opt_set(devopts, "addr", devaddr); - break; - } - default: - abort(); + if (throttle_enabled(&cfg)) { + bdrv_io_limits_enable(dinfo->bdrv); + bdrv_set_io_limits(dinfo->bdrv, &cfg); } + if (!file || !*file) { if (has_driver_specific_opts) { file = NULL; } else { + QDECREF(bs_opts); + qemu_opts_del(opts); return dinfo; } } @@ -687,35 +499,15 @@ static DriveInfo *blockdev_init(QemuOpts *all_opts, bdrv_flags |= BDRV_O_INCOMING; } - if (media == MEDIA_CDROM) { - /* CDROM is fine for any interface, don't check. */ - ro = 1; - } else if (ro == 1) { - if (type != IF_SCSI && type != IF_VIRTIO && type != IF_FLOPPY && - type != IF_NONE && type != IF_PFLASH) { - error_report("read-only not supported by this bus type"); - goto err; - } - } - bdrv_flags |= ro ? 0 : BDRV_O_RDWR; - if (ro && copy_on_read) { - error_report("warning: disabling copy_on_read on read-only drive"); - } - QINCREF(bs_opts); - ret = bdrv_open(dinfo->bdrv, file, bs_opts, bdrv_flags, drv); + ret = bdrv_open(&dinfo->bdrv, file, NULL, bs_opts, bdrv_flags, drv, &error); if (ret < 0) { - if (ret == -EMEDIUMTYPE) { - error_report("could not open disk image %s: not in %s format", - file ?: dinfo->id, drv ? drv->format_name : - qdict_get_str(bs_opts, "driver")); - } else { - error_report("could not open disk image %s: %s", - file ?: dinfo->id, strerror(-ret)); - } + error_setg(errp, "could not open disk image %s: %s", + file ?: dinfo->id, error_get_pretty(error)); + error_free(error); goto err; } @@ -728,12 +520,14 @@ static DriveInfo *blockdev_init(QemuOpts *all_opts, return dinfo; err: - qemu_opts_del(opts); - QDECREF(bs_opts); - bdrv_delete(dinfo->bdrv); - g_free(dinfo->id); + bdrv_unref(dinfo->bdrv); QTAILQ_REMOVE(&drives, dinfo, next); +bdrv_new_err: + g_free(dinfo->id); g_free(dinfo); +early_err: + QDECREF(bs_opts); + qemu_opts_del(opts); return NULL; } @@ -748,30 +542,100 @@ static void qemu_opt_rename(QemuOpts *opts, const char *from, const char *to) } } +QemuOptsList qemu_legacy_drive_opts = { + .name = "drive", + .head = QTAILQ_HEAD_INITIALIZER(qemu_legacy_drive_opts.head), + .desc = { + { + .name = "bus", + .type = QEMU_OPT_NUMBER, + .help = "bus number", + },{ + .name = "unit", + .type = QEMU_OPT_NUMBER, + .help = "unit number (i.e. lun for scsi)", + },{ + .name = "index", + .type = QEMU_OPT_NUMBER, + .help = "index number", + },{ + .name = "media", + .type = QEMU_OPT_STRING, + .help = "media type (disk, cdrom)", + },{ + .name = "if", + .type = QEMU_OPT_STRING, + .help = "interface (ide, scsi, sd, mtd, floppy, pflash, virtio)", + },{ + .name = "cyls", + .type = QEMU_OPT_NUMBER, + .help = "number of cylinders (ide disk geometry)", + },{ + .name = "heads", + .type = QEMU_OPT_NUMBER, + .help = "number of heads (ide disk geometry)", + },{ + .name = "secs", + .type = QEMU_OPT_NUMBER, + .help = "number of sectors (ide disk geometry)", + },{ + .name = "trans", + .type = QEMU_OPT_STRING, + .help = "chs translation (auto, lba, none)", + },{ + .name = "boot", + .type = QEMU_OPT_BOOL, + .help = "(deprecated, ignored)", + },{ + .name = "addr", + .type = QEMU_OPT_STRING, + .help = "pci address (virtio only)", + },{ + .name = "file", + .type = QEMU_OPT_STRING, + .help = "file name", + }, + + /* Options that are passed on, but have special semantics with -drive */ + { + .name = "read-only", + .type = QEMU_OPT_BOOL, + .help = "open drive file as read-only", + },{ + .name = "rerror", + .type = QEMU_OPT_STRING, + .help = "read error action", + },{ + .name = "werror", + .type = QEMU_OPT_STRING, + .help = "write error action", + },{ + .name = "copy-on-read", + .type = QEMU_OPT_BOOL, + .help = "copy read data from backing file into image file", + }, + + { /* end of list */ } + }, +}; + DriveInfo *drive_init(QemuOpts *all_opts, BlockInterfaceType block_default_type) { const char *value; - - /* - * Check that only old options are used by copying into a QemuOpts with - * stricter checks. Going through a QDict seems to be the easiest way to - * achieve this... - */ - QemuOpts* check_opts; - QDict *qdict; + DriveInfo *dinfo = NULL; + QDict *bs_opts; + QemuOpts *legacy_opts; + DriveMediaType media = MEDIA_DISK; + BlockInterfaceType type; + int cyls, heads, secs, translation; + int max_devs, bus_id, unit_id, index; + const char *devaddr; + const char *werror, *rerror; + bool read_only = false; + bool copy_on_read; + const char *filename; Error *local_err = NULL; - qdict = qemu_opts_to_qdict(all_opts, NULL); - check_opts = qemu_opts_from_qdict(&qemu_old_drive_opts, qdict, &local_err); - QDECREF(qdict); - - if (error_is_set(&local_err)) { - qerror_report_err(local_err); - error_free(local_err); - return NULL; - } - qemu_opts_del(check_opts); - /* Change legacy command line options into QMP ones */ qemu_opt_rename(all_opts, "iops", "throttling.iops-total"); qemu_opt_rename(all_opts, "iops_rd", "throttling.iops-read"); @@ -781,6 +645,17 @@ DriveInfo *drive_init(QemuOpts *all_opts, BlockInterfaceType block_default_type) qemu_opt_rename(all_opts, "bps_rd", "throttling.bps-read"); qemu_opt_rename(all_opts, "bps_wr", "throttling.bps-write"); + qemu_opt_rename(all_opts, "iops_max", "throttling.iops-total-max"); + qemu_opt_rename(all_opts, "iops_rd_max", "throttling.iops-read-max"); + qemu_opt_rename(all_opts, "iops_wr_max", "throttling.iops-write-max"); + + qemu_opt_rename(all_opts, "bps_max", "throttling.bps-total-max"); + qemu_opt_rename(all_opts, "bps_rd_max", "throttling.bps-read-max"); + qemu_opt_rename(all_opts, "bps_wr_max", "throttling.bps-write-max"); + + qemu_opt_rename(all_opts, + "iops_size", "throttling.iops-size"); + qemu_opt_rename(all_opts, "readonly", "read-only"); value = qemu_opt_get(all_opts, "cache"); @@ -808,7 +683,262 @@ DriveInfo *drive_init(QemuOpts *all_opts, BlockInterfaceType block_default_type) qemu_opt_unset(all_opts, "cache"); } - return blockdev_init(all_opts, block_default_type); + /* Get a QDict for processing the options */ + bs_opts = qdict_new(); + qemu_opts_to_qdict(all_opts, bs_opts); + + legacy_opts = qemu_opts_create(&qemu_legacy_drive_opts, NULL, 0, + &error_abort); + qemu_opts_absorb_qdict(legacy_opts, bs_opts, &local_err); + if (local_err) { + qerror_report_err(local_err); + error_free(local_err); + goto fail; + } + + /* Deprecated option boot=[on|off] */ + if (qemu_opt_get(legacy_opts, "boot") != NULL) { + fprintf(stderr, "qemu-kvm: boot=on|off is deprecated and will be " + "ignored. Future versions will reject this parameter. Please " + "update your scripts.\n"); + } + + /* Media type */ + value = qemu_opt_get(legacy_opts, "media"); + if (value) { + if (!strcmp(value, "disk")) { + media = MEDIA_DISK; + } else if (!strcmp(value, "cdrom")) { + media = MEDIA_CDROM; + read_only = true; + } else { + error_report("'%s' invalid media", value); + goto fail; + } + } + + /* copy-on-read is disabled with a warning for read-only devices */ + read_only |= qemu_opt_get_bool(legacy_opts, "read-only", false); + copy_on_read = qemu_opt_get_bool(legacy_opts, "copy-on-read", false); + + if (read_only && copy_on_read) { + error_report("warning: disabling copy-on-read on read-only drive"); + copy_on_read = false; + } + + qdict_put(bs_opts, "read-only", + qstring_from_str(read_only ? "on" : "off")); + qdict_put(bs_opts, "copy-on-read", + qstring_from_str(copy_on_read ? "on" :"off")); + + /* Controller type */ + value = qemu_opt_get(legacy_opts, "if"); + if (value) { + for (type = 0; + type < IF_COUNT && strcmp(value, if_name[type]); + type++) { + } + if (type == IF_COUNT) { + error_report("unsupported bus type '%s'", value); + goto fail; + } + } else { + type = block_default_type; + } + + /* Geometry */ + cyls = qemu_opt_get_number(legacy_opts, "cyls", 0); + heads = qemu_opt_get_number(legacy_opts, "heads", 0); + secs = qemu_opt_get_number(legacy_opts, "secs", 0); + + if (cyls || heads || secs) { + if (cyls < 1) { + error_report("invalid physical cyls number"); + goto fail; + } + if (heads < 1) { + error_report("invalid physical heads number"); + goto fail; + } + if (secs < 1) { + error_report("invalid physical secs number"); + goto fail; + } + } + + translation = BIOS_ATA_TRANSLATION_AUTO; + value = qemu_opt_get(legacy_opts, "trans"); + if (value != NULL) { + if (!cyls) { + error_report("'%s' trans must be used with cyls, heads and secs", + value); + goto fail; + } + if (!strcmp(value, "none")) { + translation = BIOS_ATA_TRANSLATION_NONE; + } else if (!strcmp(value, "lba")) { + translation = BIOS_ATA_TRANSLATION_LBA; + } else if (!strcmp(value, "large")) { + translation = BIOS_ATA_TRANSLATION_LARGE; + } else if (!strcmp(value, "rechs")) { + translation = BIOS_ATA_TRANSLATION_RECHS; + } else if (!strcmp(value, "auto")) { + translation = BIOS_ATA_TRANSLATION_AUTO; + } else { + error_report("'%s' invalid translation type", value); + goto fail; + } + } + + if (media == MEDIA_CDROM) { + if (cyls || secs || heads) { + error_report("CHS can't be set with media=cdrom"); + goto fail; + } + } + + /* Device address specified by bus/unit or index. + * If none was specified, try to find the first free one. */ + bus_id = qemu_opt_get_number(legacy_opts, "bus", 0); + unit_id = qemu_opt_get_number(legacy_opts, "unit", -1); + index = qemu_opt_get_number(legacy_opts, "index", -1); + + max_devs = if_max_devs[type]; + + if (index != -1) { + if (bus_id != 0 || unit_id != -1) { + error_report("index cannot be used with bus and unit"); + goto fail; + } + bus_id = drive_index_to_bus_id(type, index); + unit_id = drive_index_to_unit_id(type, index); + } + + if (unit_id == -1) { + unit_id = 0; + while (drive_get(type, bus_id, unit_id) != NULL) { + unit_id++; + if (max_devs && unit_id >= max_devs) { + unit_id -= max_devs; + bus_id++; + } + } + } + + if (max_devs && unit_id >= max_devs) { + error_report("unit %d too big (max is %d)", unit_id, max_devs - 1); + goto fail; + } + + if (drive_get(type, bus_id, unit_id) != NULL) { + error_report("drive with bus=%d, unit=%d (index=%d) exists", + bus_id, unit_id, index); + goto fail; + } + + /* no id supplied -> create one */ + if (qemu_opts_id(all_opts) == NULL) { + char *new_id; + const char *mediastr = ""; + if (type == IF_IDE || type == IF_SCSI) { + mediastr = (media == MEDIA_CDROM) ? "-cd" : "-hd"; + } + if (max_devs) { + new_id = g_strdup_printf("%s%i%s%i", if_name[type], bus_id, + mediastr, unit_id); + } else { + new_id = g_strdup_printf("%s%s%i", if_name[type], + mediastr, unit_id); + } + qdict_put(bs_opts, "id", qstring_from_str(new_id)); + g_free(new_id); + } + + /* Add virtio block device */ + devaddr = qemu_opt_get(legacy_opts, "addr"); + if (devaddr && type != IF_VIRTIO) { + error_report("addr is not supported by this bus type"); + goto fail; + } + + if (type == IF_VIRTIO) { + QemuOpts *devopts; + devopts = qemu_opts_create(qemu_find_opts("device"), NULL, 0, + &error_abort); + if (arch_type == QEMU_ARCH_S390X) { + qemu_opt_set(devopts, "driver", "virtio-blk-s390"); + } else { + qemu_opt_set(devopts, "driver", "virtio-blk-pci"); + } + qemu_opt_set(devopts, "drive", qdict_get_str(bs_opts, "id")); + if (devaddr) { + qemu_opt_set(devopts, "addr", devaddr); + } + } + + filename = qemu_opt_get(legacy_opts, "file"); + + /* Check werror/rerror compatibility with if=... */ + werror = qemu_opt_get(legacy_opts, "werror"); + if (werror != NULL) { + if (type != IF_IDE && type != IF_SCSI && type != IF_VIRTIO && + type != IF_NONE) { + error_report("werror is not supported by this bus type"); + goto fail; + } + qdict_put(bs_opts, "werror", qstring_from_str(werror)); + } + + rerror = qemu_opt_get(legacy_opts, "rerror"); + if (rerror != NULL) { + if (type != IF_IDE && type != IF_VIRTIO && type != IF_SCSI && + type != IF_NONE) { + error_report("rerror is not supported by this bus type"); + goto fail; + } + qdict_put(bs_opts, "rerror", qstring_from_str(rerror)); + } + + /* Actual block device init: Functionality shared with blockdev-add */ + dinfo = blockdev_init(filename, bs_opts, &local_err); + if (dinfo == NULL) { + if (local_err) { + qerror_report_err(local_err); + error_free(local_err); + } + goto fail; + } else { + assert(!local_err); + } + + /* Set legacy DriveInfo fields */ + dinfo->enable_auto_del = true; + dinfo->opts = all_opts; + + dinfo->cyls = cyls; + dinfo->heads = heads; + dinfo->secs = secs; + dinfo->trans = translation; + + dinfo->type = type; + dinfo->bus = bus_id; + dinfo->unit = unit_id; + dinfo->devaddr = devaddr; + + switch(type) { + case IF_IDE: + case IF_SCSI: + case IF_XEN: + case IF_NONE: + dinfo->media_cd = media == MEDIA_CDROM; + break; + default: + break; + } + +fail: + qemu_opts_del(legacy_opts); + return dinfo; } void do_commit(Monitor *mon, const QDict *qdict) @@ -845,14 +975,22 @@ static void blockdev_do_action(int kind, void *data, Error **errp) qmp_transaction(&list, errp); } -void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file, +void qmp_blockdev_snapshot_sync(bool has_device, const char *device, + bool has_node_name, const char *node_name, + const char *snapshot_file, + bool has_snapshot_node_name, + const char *snapshot_node_name, bool has_format, const char *format, - bool has_mode, enum NewImageMode mode, - Error **errp) + bool has_mode, NewImageMode mode, Error **errp) { BlockdevSnapshot snapshot = { + .has_device = has_device, .device = (char *) device, + .has_node_name = has_node_name, + .node_name = (char *) node_name, .snapshot_file = (char *) snapshot_file, + .has_snapshot_node_name = has_snapshot_node_name, + .snapshot_node_name = (char *) snapshot_node_name, .has_format = has_format, .format = (char *) format, .has_mode = has_mode, @@ -862,6 +1000,80 @@ void qmp_blockdev_snapshot_sync(const char *device, const char *snapshot_file, &snapshot, errp); } +void qmp_blockdev_snapshot_internal_sync(const char *device, + const char *name, + Error **errp) +{ + BlockdevSnapshotInternal snapshot = { + .device = (char *) device, + .name = (char *) name + }; + + blockdev_do_action(TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_INTERNAL_SYNC, + &snapshot, errp); +} + +SnapshotInfo *qmp_blockdev_snapshot_delete_internal_sync(const char *device, + bool has_id, + const char *id, + bool has_name, + const char *name, + Error **errp) +{ + BlockDriverState *bs = bdrv_find(device); + QEMUSnapshotInfo sn; + Error *local_err = NULL; + SnapshotInfo *info = NULL; + int ret; + + if (!bs) { + error_set(errp, QERR_DEVICE_NOT_FOUND, device); + return NULL; + } + + if (!has_id) { + id = NULL; + } + + if (!has_name) { + name = NULL; + } + + if (!id && !name) { + error_setg(errp, "Name or id must be provided"); + return NULL; + } + + ret = bdrv_snapshot_find_by_id_and_name(bs, id, name, &sn, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return NULL; + } + if (!ret) { + error_setg(errp, + "Snapshot with id '%s' and name '%s' does not exist on " + "device '%s'", + STR_OR_NULL(id), STR_OR_NULL(name), device); + return NULL; + } + + bdrv_snapshot_delete(bs, id, name, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return NULL; + } + + info = g_malloc0(sizeof(SnapshotInfo)); + info->id = g_strdup(sn.id_str); + info->name = g_strdup(sn.name); + info->date_nsec = sn.date_nsec; + info->date_sec = sn.date_sec; + info->vm_state_size = sn.vm_state_size; + info->vm_clock_nsec = sn.vm_clock_nsec % 1000000000; + info->vm_clock_sec = sn.vm_clock_nsec / 1000000000; + + return info; +} /* New and old BlockDriverState structs for group snapshots */ @@ -882,16 +1094,130 @@ typedef struct BdrvActionOps { void (*clean)(BlkTransactionState *common); } BdrvActionOps; -/* - * This structure must be arranged as first member in child type, assuming - * that compiler will also arrange it to the same address with parent instance. - * Later it will be used in free(). - */ -struct BlkTransactionState { - TransactionAction *action; - const BdrvActionOps *ops; - QSIMPLEQ_ENTRY(BlkTransactionState) entry; -}; +/* + * This structure must be arranged as first member in child type, assuming + * that compiler will also arrange it to the same address with parent instance. + * Later it will be used in free(). + */ +struct BlkTransactionState { + TransactionAction *action; + const BdrvActionOps *ops; + QSIMPLEQ_ENTRY(BlkTransactionState) entry; +}; + +/* internal snapshot private data */ +typedef struct InternalSnapshotState { + BlkTransactionState common; + BlockDriverState *bs; + QEMUSnapshotInfo sn; +} InternalSnapshotState; + +static void internal_snapshot_prepare(BlkTransactionState *common, + Error **errp) +{ + Error *local_err = NULL; + const char *device; + const char *name; + BlockDriverState *bs; + QEMUSnapshotInfo old_sn, *sn; + bool ret; + qemu_timeval tv; + BlockdevSnapshotInternal *internal; + InternalSnapshotState *state; + int ret1; + + g_assert(common->action->kind == + TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_INTERNAL_SYNC); + internal = common->action->blockdev_snapshot_internal_sync; + state = DO_UPCAST(InternalSnapshotState, common, common); + + /* 1. parse input */ + device = internal->device; + name = internal->name; + + /* 2. check for validation */ + bs = bdrv_find(device); + if (!bs) { + error_set(errp, QERR_DEVICE_NOT_FOUND, device); + return; + } + + if (!bdrv_is_inserted(bs)) { + error_set(errp, QERR_DEVICE_HAS_NO_MEDIUM, device); + return; + } + + if (bdrv_is_read_only(bs)) { + error_set(errp, QERR_DEVICE_IS_READ_ONLY, device); + return; + } + + if (!bdrv_can_snapshot(bs)) { + error_set(errp, QERR_BLOCK_FORMAT_FEATURE_NOT_SUPPORTED, + bs->drv->format_name, device, "internal snapshot"); + return; + } + + if (!strlen(name)) { + error_setg(errp, "Name is empty"); + return; + } + + /* check whether a snapshot with name exist */ + ret = bdrv_snapshot_find_by_id_and_name(bs, NULL, name, &old_sn, + &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } else if (ret) { + error_setg(errp, + "Snapshot with name '%s' already exists on device '%s'", + name, device); + return; + } + + /* 3. take the snapshot */ + sn = &state->sn; + pstrcpy(sn->name, sizeof(sn->name), name); + qemu_gettimeofday(&tv); + sn->date_sec = tv.tv_sec; + sn->date_nsec = tv.tv_usec * 1000; + sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); + + ret1 = bdrv_snapshot_create(bs, sn); + if (ret1 < 0) { + error_setg_errno(errp, -ret1, + "Failed to create snapshot '%s' on device '%s'", + name, device); + return; + } + + /* 4. succeed, mark a snapshot is created */ + state->bs = bs; +} + +static void internal_snapshot_abort(BlkTransactionState *common) +{ + InternalSnapshotState *state = + DO_UPCAST(InternalSnapshotState, common, common); + BlockDriverState *bs = state->bs; + QEMUSnapshotInfo *sn = &state->sn; + Error *local_error = NULL; + + if (!bs) { + return; + } + + if (bdrv_snapshot_delete(bs, sn->id_str, sn->name, &local_error) < 0) { + error_report("Failed to delete snapshot with id '%s' and name '%s' on " + "device '%s' in abort: %s", + sn->id_str, + sn->name, + bdrv_get_device_name(bs), + error_get_pretty(local_error)); + error_free(local_error); + } +} /* external snapshot private data */ typedef struct ExternalSnapshotState { @@ -905,8 +1231,14 @@ static void external_snapshot_prepare(BlkTransactionState *common, { BlockDriver *drv; int flags, ret; + QDict *options = NULL; Error *local_err = NULL; + bool has_device = false; const char *device; + bool has_node_name = false; + const char *node_name; + bool has_snapshot_node_name = false; + const char *snapshot_node_name; const char *new_image_file; const char *format = "qcow2"; enum NewImageMode mode = NEW_IMAGE_MODE_ABSOLUTE_PATHS; @@ -917,7 +1249,14 @@ static void external_snapshot_prepare(BlkTransactionState *common, /* get parameters */ g_assert(action->kind == TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_SYNC); + has_device = action->blockdev_snapshot_sync->has_device; device = action->blockdev_snapshot_sync->device; + has_node_name = action->blockdev_snapshot_sync->has_node_name; + node_name = action->blockdev_snapshot_sync->node_name; + has_snapshot_node_name = + action->blockdev_snapshot_sync->has_snapshot_node_name; + snapshot_node_name = action->blockdev_snapshot_sync->snapshot_node_name; + new_image_file = action->blockdev_snapshot_sync->snapshot_file; if (action->blockdev_snapshot_sync->has_format) { format = action->blockdev_snapshot_sync->format; @@ -933,9 +1272,21 @@ static void external_snapshot_prepare(BlkTransactionState *common, return; } - state->old_bs = bdrv_find(device); - if (!state->old_bs) { - error_set(errp, QERR_DEVICE_NOT_FOUND, device); + state->old_bs = bdrv_lookup_bs(has_device ? device : NULL, + has_node_name ? node_name : NULL, + &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + if (has_node_name && !has_snapshot_node_name) { + error_setg(errp, "New snapshot node name missing"); + return; + } + + if (has_snapshot_node_name && bdrv_find_node(snapshot_node_name)) { + error_setg(errp, "New snapshot node name already existing"); return; } @@ -956,6 +1307,11 @@ static void external_snapshot_prepare(BlkTransactionState *common, } } + if (!bdrv_is_first_non_filter(state->old_bs)) { + error_set(errp, QERR_FEATURE_DISABLED, "snapshot"); + return; + } + flags = state->old_bs->open_flags; /* create new image w/backing file */ @@ -964,20 +1320,26 @@ static void external_snapshot_prepare(BlkTransactionState *common, state->old_bs->filename, state->old_bs->drv->format_name, NULL, -1, flags, &local_err, false); - if (error_is_set(&local_err)) { + if (local_err) { error_propagate(errp, local_err); return; } } - /* We will manually add the backing_hd field to the bs later */ - state->new_bs = bdrv_new(""); + if (has_snapshot_node_name) { + options = qdict_new(); + qdict_put(options, "node-name", + qstring_from_str(snapshot_node_name)); + } + /* TODO Inherit bs->options or only take explicit options with an * extended QMP command? */ - ret = bdrv_open(state->new_bs, new_image_file, NULL, - flags | BDRV_O_NO_BACKING, drv); + assert(state->new_bs == NULL); + ret = bdrv_open(&state->new_bs, new_image_file, NULL, options, + flags | BDRV_O_NO_BACKING, drv, &local_err); + /* We will manually add the backing_hd field to the bs later */ if (ret != 0) { - error_setg_file_open(errp, -ret, new_image_file); + error_propagate(errp, local_err); } } @@ -1000,7 +1362,7 @@ static void external_snapshot_abort(BlkTransactionState *common) ExternalSnapshotState *state = DO_UPCAST(ExternalSnapshotState, common, common); if (state->new_bs) { - bdrv_delete(state->new_bs); + bdrv_unref(state->new_bs); } } @@ -1027,7 +1389,7 @@ static void drive_backup_prepare(BlkTransactionState *common, Error **errp) backup->has_on_source_error, backup->on_source_error, backup->has_on_target_error, backup->on_target_error, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { error_propagate(errp, local_err); state->bs = NULL; state->job = NULL; @@ -1076,6 +1438,11 @@ static const BdrvActionOps actions[] = { .prepare = abort_prepare, .commit = abort_commit, }, + [TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_INTERNAL_SYNC] = { + .instance_size = sizeof(InternalSnapshotState), + .prepare = internal_snapshot_prepare, + .abort = internal_snapshot_abort, + }, }; /* @@ -1106,13 +1473,15 @@ void qmp_transaction(TransactionActionList *dev_list, Error **errp) assert(dev_info->kind < ARRAY_SIZE(actions)); ops = &actions[dev_info->kind]; + assert(ops->instance_size > 0); + state = g_malloc0(ops->instance_size); state->ops = ops; state->action = dev_info; QSIMPLEQ_INSERT_TAIL(&snap_bdrv_states, state, entry); state->ops->prepare(state, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { error_propagate(errp, local_err); goto delete_and_fail; } @@ -1154,14 +1523,16 @@ static void eject_device(BlockDriverState *bs, int force, Error **errp) return; } if (!bdrv_dev_has_removable_media(bs)) { - error_set(errp, QERR_DEVICE_NOT_REMOVABLE, bdrv_get_device_name(bs)); + error_setg(errp, "Device '%s' is not removable", + bdrv_get_device_name(bs)); return; } if (bdrv_dev_is_medium_locked(bs) && !bdrv_dev_is_tray_open(bs)) { bdrv_dev_eject_request(bs, force); if (!force) { - error_set(errp, QERR_DEVICE_LOCKED, bdrv_get_device_name(bs)); + error_setg(errp, "Device '%s' is locked", + bdrv_get_device_name(bs)); return; } } @@ -1182,14 +1553,19 @@ void qmp_eject(const char *device, bool has_force, bool force, Error **errp) eject_device(bs, force, errp); } -void qmp_block_passwd(const char *device, const char *password, Error **errp) +void qmp_block_passwd(bool has_device, const char *device, + bool has_node_name, const char *node_name, + const char *password, Error **errp) { + Error *local_err = NULL; BlockDriverState *bs; int err; - bs = bdrv_find(device); - if (!bs) { - error_set(errp, QERR_DEVICE_NOT_FOUND, device); + bs = bdrv_lookup_bs(has_device ? device : NULL, + has_node_name ? node_name : NULL, + &local_err); + if (local_err) { + error_propagate(errp, local_err); return; } @@ -1207,11 +1583,12 @@ static void qmp_bdrv_open_encrypted(BlockDriverState *bs, const char *filename, int bdrv_flags, BlockDriver *drv, const char *password, Error **errp) { + Error *local_err = NULL; int ret; - ret = bdrv_open(bs, filename, NULL, bdrv_flags, drv); + ret = bdrv_open(&bs, filename, NULL, NULL, bdrv_flags, drv, &local_err); if (ret < 0) { - error_setg_file_open(errp, -ret, filename); + error_propagate(errp, local_err); return; } @@ -1230,7 +1607,7 @@ static void qmp_bdrv_open_encrypted(BlockDriverState *bs, const char *filename, } void qmp_change_blockdev(const char *device, const char *filename, - bool has_format, const char *format, Error **errp) + const char *format, Error **errp) { BlockDriverState *bs; BlockDriver *drv = NULL; @@ -1252,7 +1629,7 @@ void qmp_change_blockdev(const char *device, const char *filename, } eject_device(bs, 0, &err); - if (error_is_set(&err)) { + if (err) { error_propagate(errp, err); return; } @@ -1265,10 +1642,26 @@ void qmp_change_blockdev(const char *device, const char *filename, /* throttling disk I/O limits */ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd, - int64_t bps_wr, int64_t iops, int64_t iops_rd, - int64_t iops_wr, Error **errp) + int64_t bps_wr, + int64_t iops, + int64_t iops_rd, + int64_t iops_wr, + bool has_bps_max, + int64_t bps_max, + bool has_bps_rd_max, + int64_t bps_rd_max, + bool has_bps_wr_max, + int64_t bps_wr_max, + bool has_iops_max, + int64_t iops_max, + bool has_iops_rd_max, + int64_t iops_rd_max, + bool has_iops_wr_max, + int64_t iops_wr_max, + bool has_iops_size, + int64_t iops_size, Error **errp) { - BlockIOLimit io_limits; + ThrottleConfig cfg; BlockDriverState *bs; bs = bdrv_find(device); @@ -1277,27 +1670,50 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd, return; } - io_limits.bps[BLOCK_IO_LIMIT_TOTAL] = bps; - io_limits.bps[BLOCK_IO_LIMIT_READ] = bps_rd; - io_limits.bps[BLOCK_IO_LIMIT_WRITE] = bps_wr; - io_limits.iops[BLOCK_IO_LIMIT_TOTAL]= iops; - io_limits.iops[BLOCK_IO_LIMIT_READ] = iops_rd; - io_limits.iops[BLOCK_IO_LIMIT_WRITE]= iops_wr; + memset(&cfg, 0, sizeof(cfg)); + cfg.buckets[THROTTLE_BPS_TOTAL].avg = bps; + cfg.buckets[THROTTLE_BPS_READ].avg = bps_rd; + cfg.buckets[THROTTLE_BPS_WRITE].avg = bps_wr; - if (!do_check_io_limits(&io_limits, errp)) { - return; + cfg.buckets[THROTTLE_OPS_TOTAL].avg = iops; + cfg.buckets[THROTTLE_OPS_READ].avg = iops_rd; + cfg.buckets[THROTTLE_OPS_WRITE].avg = iops_wr; + + if (has_bps_max) { + cfg.buckets[THROTTLE_BPS_TOTAL].max = bps_max; + } + if (has_bps_rd_max) { + cfg.buckets[THROTTLE_BPS_READ].max = bps_rd_max; + } + if (has_bps_wr_max) { + cfg.buckets[THROTTLE_BPS_WRITE].max = bps_wr_max; + } + if (has_iops_max) { + cfg.buckets[THROTTLE_OPS_TOTAL].max = iops_max; + } + if (has_iops_rd_max) { + cfg.buckets[THROTTLE_OPS_READ].max = iops_rd_max; + } + if (has_iops_wr_max) { + cfg.buckets[THROTTLE_OPS_WRITE].max = iops_wr_max; + } + + if (has_iops_size) { + cfg.op_size = iops_size; } - bs->io_limits = io_limits; + if (!check_throttle_config(&cfg, errp)) { + return; + } - if (!bs->io_limits_enabled && bdrv_io_limits_enabled(bs)) { + if (!bs->io_limits_enabled && throttle_enabled(&cfg)) { bdrv_io_limits_enable(bs); - } else if (bs->io_limits_enabled && !bdrv_io_limits_enabled(bs)) { + } else if (bs->io_limits_enabled && !throttle_enabled(&cfg)) { bdrv_io_limits_disable(bs); - } else { - if (bs->block_timer) { - qemu_mod_timer(bs->block_timer, qemu_get_clock_ns(vm_clock)); - } + } + + if (bs->io_limits_enabled) { + bdrv_set_io_limits(bs, &cfg); } } @@ -1339,14 +1755,24 @@ int do_drive_del(Monitor *mon, const QDict *qdict, QObject **ret_data) return 0; } -void qmp_block_resize(const char *device, int64_t size, Error **errp) +void qmp_block_resize(bool has_device, const char *device, + bool has_node_name, const char *node_name, + int64_t size, Error **errp) { + Error *local_err = NULL; BlockDriverState *bs; int ret; - bs = bdrv_find(device); - if (!bs) { - error_set(errp, QERR_DEVICE_NOT_FOUND, device); + bs = bdrv_lookup_bs(has_device ? device : NULL, + has_node_name ? node_name : NULL, + &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + if (!bdrv_is_first_non_filter(bs)) { + error_set(errp, QERR_FEATURE_DISABLED, "resize"); return; } @@ -1401,7 +1827,7 @@ static void block_job_cb(void *opaque, int ret) } qobject_decref(obj); - drive_put_ref_bh_schedule(drive_get_by_blockdev(bs)); + bdrv_put_ref_bh_schedule(bs); } void qmp_block_stream(const char *device, bool has_base, @@ -1433,16 +1859,11 @@ void qmp_block_stream(const char *device, bool has_base, stream_start(bs, base_bs, base, has_speed ? speed : 0, on_error, block_job_cb, bs, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { error_propagate(errp, local_err); return; } - /* Grab a reference so hotplug does not delete the BlockDriverState from - * underneath us. - */ - drive_get_ref(drive_get_by_blockdev(bs)); - trace_qmp_block_stream(bs, bs->job); } @@ -1459,6 +1880,10 @@ void qmp_block_commit(const char *device, */ BlockdevOnError on_error = BLOCKDEV_ON_ERROR_REPORT; + if (!has_speed) { + speed = 0; + } + /* drain all i/o before commits */ bdrv_drain_all(); @@ -1493,16 +1918,17 @@ void qmp_block_commit(const char *device, return; } - commit_start(bs, base_bs, top_bs, speed, on_error, block_job_cb, bs, - &local_err); + if (top_bs == bs) { + commit_active_start(bs, base_bs, speed, on_error, block_job_cb, + bs, &local_err); + } else { + commit_start(bs, base_bs, top_bs, speed, on_error, block_job_cb, bs, + &local_err); + } if (local_err != NULL) { error_propagate(errp, local_err); return; } - /* Grab a reference so hotplug does not delete the BlockDriverState from - * underneath us. - */ - drive_get_ref(drive_get_by_blockdev(bs)); } void qmp_drive_backup(const char *device, const char *target, @@ -1595,31 +2021,30 @@ void qmp_drive_backup(const char *device, const char *target, } } - if (error_is_set(&local_err)) { + if (local_err) { error_propagate(errp, local_err); return; } - target_bs = bdrv_new(""); - ret = bdrv_open(target_bs, target, NULL, flags, drv); + target_bs = NULL; + ret = bdrv_open(&target_bs, target, NULL, NULL, flags, drv, &local_err); if (ret < 0) { - bdrv_delete(target_bs); - error_setg_file_open(errp, -ret, target); + error_propagate(errp, local_err); return; } backup_start(bs, target_bs, speed, sync, on_source_error, on_target_error, block_job_cb, bs, &local_err); if (local_err != NULL) { - bdrv_delete(target_bs); + bdrv_unref(target_bs); error_propagate(errp, local_err); return; } +} - /* Grab a reference so hotplug does not delete the BlockDriverState from - * underneath us. - */ - drive_get_ref(drive_get_by_blockdev(bs)); +BlockDeviceInfoList *qmp_query_named_block_nodes(Error **errp) +{ + return bdrv_named_nodes_list(); } #define DEFAULT_MIRROR_BUF_SIZE (10 << 20) @@ -1703,6 +2128,9 @@ void qmp_drive_mirror(const char *device, const char *target, if (!source && sync == MIRROR_SYNC_MODE_TOP) { sync = MIRROR_SYNC_MODE_FULL; } + if (sync == MIRROR_SYNC_MODE_NONE) { + source = bs; + } size = bdrv_getlength(bs); if (size < 0) { @@ -1710,7 +2138,9 @@ void qmp_drive_mirror(const char *device, const char *target, return; } - if (sync == MIRROR_SYNC_MODE_FULL && mode != NEW_IMAGE_MODE_EXISTING) { + if ((sync == MIRROR_SYNC_MODE_FULL || !source) + && mode != NEW_IMAGE_MODE_EXISTING) + { /* create new image w/o backing file */ assert(format && drv); bdrv_img_create(target, format, @@ -1718,7 +2148,6 @@ void qmp_drive_mirror(const char *device, const char *target, } else { switch (mode) { case NEW_IMAGE_MODE_EXISTING: - ret = 0; break; case NEW_IMAGE_MODE_ABSOLUTE_PATHS: /* create new image with backing file */ @@ -1732,7 +2161,7 @@ void qmp_drive_mirror(const char *device, const char *target, } } - if (error_is_set(&local_err)) { + if (local_err) { error_propagate(errp, local_err); return; } @@ -1740,11 +2169,11 @@ void qmp_drive_mirror(const char *device, const char *target, /* Mirroring takes care of copy-on-write using the source's backing * file. */ - target_bs = bdrv_new(""); - ret = bdrv_open(target_bs, target, NULL, flags | BDRV_O_NO_BACKING, drv); + target_bs = NULL; + ret = bdrv_open(&target_bs, target, NULL, NULL, flags | BDRV_O_NO_BACKING, + drv, &local_err); if (ret < 0) { - bdrv_delete(target_bs); - error_setg_file_open(errp, -ret, target); + error_propagate(errp, local_err); return; } @@ -1752,15 +2181,10 @@ void qmp_drive_mirror(const char *device, const char *target, on_source_error, on_target_error, block_job_cb, bs, &local_err); if (local_err != NULL) { - bdrv_delete(target_bs); + bdrv_unref(target_bs); error_propagate(errp, local_err); return; } - - /* Grab a reference so hotplug does not delete the BlockDriverState from - * underneath us. - */ - drive_get_ref(drive_get_by_blockdev(bs)); } static BlockJob *find_block_job(const char *device) @@ -1800,7 +2224,8 @@ void qmp_block_job_cancel(const char *device, return; } if (job->paused && !force) { - error_set(errp, QERR_BLOCK_JOB_PAUSED, device); + error_setg(errp, "The block job for device '%s' is currently paused", + device); return; } @@ -1847,6 +2272,63 @@ void qmp_block_job_complete(const char *device, Error **errp) block_job_complete(job, errp); } +void qmp_blockdev_add(BlockdevOptions *options, Error **errp) +{ + QmpOutputVisitor *ov = qmp_output_visitor_new(); + DriveInfo *dinfo; + QObject *obj; + QDict *qdict; + Error *local_err = NULL; + + /* Require an ID in the top level */ + if (!options->has_id) { + error_setg(errp, "Block device needs an ID"); + goto fail; + } + + /* TODO Sort it out in raw-posix and drive_init: Reject aio=native with + * cache.direct=false instead of silently switching to aio=threads, except + * if called from drive_init. + * + * For now, simply forbidding the combination for all drivers will do. */ + if (options->has_aio && options->aio == BLOCKDEV_AIO_OPTIONS_NATIVE) { + bool direct = options->has_cache && + options->cache->has_direct && + options->cache->direct; + if (!direct) { + error_setg(errp, "aio=native requires cache.direct=true"); + goto fail; + } + } + + visit_type_BlockdevOptions(qmp_output_get_visitor(ov), + &options, NULL, &local_err); + if (local_err) { + error_propagate(errp, local_err); + goto fail; + } + + obj = qmp_output_get_qobject(ov); + qdict = qobject_to_qdict(obj); + + qdict_flatten(qdict); + + dinfo = blockdev_init(NULL, qdict, &local_err); + if (local_err) { + error_propagate(errp, local_err); + goto fail; + } + + if (bdrv_key_required(dinfo->bdrv)) { + drive_uninit(dinfo); + error_setg(errp, "blockdev-add doesn't support encrypted devices"); + goto fail; + } + +fail: + qmp_output_visitor_cleanup(ov); +} + static void do_qmp_query_block_jobs_one(void *opaque, BlockDriverState *bs) { BlockJobInfoList **prev = opaque; @@ -1874,49 +2356,9 @@ QemuOptsList qemu_common_drive_opts = { .head = QTAILQ_HEAD_INITIALIZER(qemu_common_drive_opts.head), .desc = { { - .name = "bus", - .type = QEMU_OPT_NUMBER, - .help = "bus number", - },{ - .name = "unit", - .type = QEMU_OPT_NUMBER, - .help = "unit number (i.e. lun for scsi)", - },{ - .name = "if", - .type = QEMU_OPT_STRING, - .help = "interface (ide, scsi, sd, mtd, floppy, pflash, virtio)", - },{ - .name = "index", - .type = QEMU_OPT_NUMBER, - .help = "index number", - },{ - .name = "cyls", - .type = QEMU_OPT_NUMBER, - .help = "number of cylinders (ide disk geometry)", - },{ - .name = "heads", - .type = QEMU_OPT_NUMBER, - .help = "number of heads (ide disk geometry)", - },{ - .name = "secs", - .type = QEMU_OPT_NUMBER, - .help = "number of sectors (ide disk geometry)", - },{ - .name = "trans", - .type = QEMU_OPT_STRING, - .help = "chs translation (auto, lba. none)", - },{ - .name = "media", - .type = QEMU_OPT_STRING, - .help = "media type (disk, cdrom)", - },{ .name = "snapshot", .type = QEMU_OPT_BOOL, .help = "enable/disable snapshot mode", - },{ - .name = "file", - .type = QEMU_OPT_STRING, - .help = "disk image", },{ .name = "discard", .type = QEMU_OPT_STRING, @@ -1953,10 +2395,6 @@ QemuOptsList qemu_common_drive_opts = { .name = "werror", .type = QEMU_OPT_STRING, .help = "write error action", - },{ - .name = "addr", - .type = QEMU_OPT_STRING, - .help = "pci address (virtio only)", },{ .name = "read-only", .type = QEMU_OPT_BOOL, @@ -1986,135 +2424,37 @@ QemuOptsList qemu_common_drive_opts = { .type = QEMU_OPT_NUMBER, .help = "limit write bytes per second", },{ - .name = "copy-on-read", - .type = QEMU_OPT_BOOL, - .help = "copy read data from backing file into image file", - },{ - .name = "boot", - .type = QEMU_OPT_BOOL, - .help = "(deprecated, ignored)", - }, - { /* end of list */ } - }, -}; - -QemuOptsList qemu_old_drive_opts = { - .name = "drive", - .head = QTAILQ_HEAD_INITIALIZER(qemu_old_drive_opts.head), - .desc = { - { - .name = "bus", - .type = QEMU_OPT_NUMBER, - .help = "bus number", - },{ - .name = "unit", - .type = QEMU_OPT_NUMBER, - .help = "unit number (i.e. lun for scsi)", - },{ - .name = "if", - .type = QEMU_OPT_STRING, - .help = "interface (ide, scsi, sd, mtd, floppy, pflash, virtio)", - },{ - .name = "index", - .type = QEMU_OPT_NUMBER, - .help = "index number", - },{ - .name = "cyls", - .type = QEMU_OPT_NUMBER, - .help = "number of cylinders (ide disk geometry)", - },{ - .name = "heads", - .type = QEMU_OPT_NUMBER, - .help = "number of heads (ide disk geometry)", - },{ - .name = "secs", + .name = "throttling.iops-total-max", .type = QEMU_OPT_NUMBER, - .help = "number of sectors (ide disk geometry)", - },{ - .name = "trans", - .type = QEMU_OPT_STRING, - .help = "chs translation (auto, lba. none)", - },{ - .name = "media", - .type = QEMU_OPT_STRING, - .help = "media type (disk, cdrom)", - },{ - .name = "snapshot", - .type = QEMU_OPT_BOOL, - .help = "enable/disable snapshot mode", - },{ - .name = "file", - .type = QEMU_OPT_STRING, - .help = "disk image", - },{ - .name = "discard", - .type = QEMU_OPT_STRING, - .help = "discard operation (ignore/off, unmap/on)", - },{ - .name = "cache", - .type = QEMU_OPT_STRING, - .help = "host cache usage (none, writeback, writethrough, " - "directsync, unsafe)", - },{ - .name = "aio", - .type = QEMU_OPT_STRING, - .help = "host AIO implementation (threads, native)", - },{ - .name = "format", - .type = QEMU_OPT_STRING, - .help = "disk format (raw, qcow2, ...)", - },{ - .name = "serial", - .type = QEMU_OPT_STRING, - .help = "disk serial number", - },{ - .name = "rerror", - .type = QEMU_OPT_STRING, - .help = "read error action", - },{ - .name = "werror", - .type = QEMU_OPT_STRING, - .help = "write error action", - },{ - .name = "addr", - .type = QEMU_OPT_STRING, - .help = "pci address (virtio only)", - },{ - .name = "readonly", - .type = QEMU_OPT_BOOL, - .help = "open drive file as read-only", + .help = "I/O operations burst", },{ - .name = "iops", + .name = "throttling.iops-read-max", .type = QEMU_OPT_NUMBER, - .help = "limit total I/O operations per second", + .help = "I/O operations read burst", },{ - .name = "iops_rd", + .name = "throttling.iops-write-max", .type = QEMU_OPT_NUMBER, - .help = "limit read operations per second", + .help = "I/O operations write burst", },{ - .name = "iops_wr", + .name = "throttling.bps-total-max", .type = QEMU_OPT_NUMBER, - .help = "limit write operations per second", + .help = "total bytes burst", },{ - .name = "bps", + .name = "throttling.bps-read-max", .type = QEMU_OPT_NUMBER, - .help = "limit total bytes per second", + .help = "total bytes read burst", },{ - .name = "bps_rd", + .name = "throttling.bps-write-max", .type = QEMU_OPT_NUMBER, - .help = "limit read bytes per second", + .help = "total bytes write burst", },{ - .name = "bps_wr", + .name = "throttling.iops-size", .type = QEMU_OPT_NUMBER, - .help = "limit write bytes per second", + .help = "when limiting by iops max size of an I/O in bytes", },{ .name = "copy-on-read", .type = QEMU_OPT_BOOL, .help = "copy read data from backing file into image file", - },{ - .name = "boot", - .type = QEMU_OPT_BOOL, - .help = "(deprecated, ignored)", }, { /* end of list */ } }, diff --git a/blockjob.c b/blockjob.c index ca80df1d0e..cd4784f053 100644 --- a/blockjob.c +++ b/blockjob.c @@ -35,7 +35,7 @@ #include "qmp-commands.h" #include "qemu/timer.h" -void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs, +void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs, int64_t speed, BlockDriverCompletionFunc *cb, void *opaque, Error **errp) { @@ -45,10 +45,11 @@ void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs, error_set(errp, QERR_DEVICE_IN_USE, bdrv_get_device_name(bs)); return NULL; } + bdrv_ref(bs); bdrv_set_in_use(bs, 1); - job = g_malloc0(job_type->instance_size); - job->job_type = job_type; + job = g_malloc0(driver->instance_size); + job->driver = driver; job->bs = bs; job->cb = cb; job->opaque = opaque; @@ -60,7 +61,7 @@ void *block_job_create(const BlockJobType *job_type, BlockDriverState *bs, Error *local_err = NULL; block_job_set_speed(job, speed, &local_err); - if (error_is_set(&local_err)) { + if (local_err) { bs->job = NULL; g_free(job); bdrv_set_in_use(bs, 0); @@ -86,12 +87,12 @@ void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp) { Error *local_err = NULL; - if (!job->job_type->set_speed) { - error_set(errp, QERR_NOT_SUPPORTED); + if (!job->driver->set_speed) { + error_set(errp, QERR_UNSUPPORTED); return; } - job->job_type->set_speed(job, speed, &local_err); - if (error_is_set(&local_err)) { + job->driver->set_speed(job, speed, &local_err); + if (local_err) { error_propagate(errp, local_err); return; } @@ -101,12 +102,12 @@ void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp) void block_job_complete(BlockJob *job, Error **errp) { - if (job->paused || job->cancelled || !job->job_type->complete) { + if (job->paused || job->cancelled || !job->driver->complete) { error_set(errp, QERR_BLOCK_JOB_NOT_READY, job->bs->device_name); return; } - job->job_type->complete(job, errp); + job->driver->complete(job, errp); } void block_job_pause(BlockJob *job) @@ -142,8 +143,8 @@ bool block_job_is_cancelled(BlockJob *job) void block_job_iostatus_reset(BlockJob *job) { job->iostatus = BLOCK_DEVICE_IO_STATUS_OK; - if (job->job_type->iostatus_reset) { - job->job_type->iostatus_reset(job); + if (job->driver->iostatus_reset) { + job->driver->iostatus_reset(job); } } @@ -187,7 +188,7 @@ int block_job_cancel_sync(BlockJob *job) return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret; } -void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns) +void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns) { assert(job->busy); @@ -200,7 +201,7 @@ void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns) if (block_job_is_paused(job)) { qemu_coroutine_yield(); } else { - co_sleep_ns(clock, ns); + co_sleep_ns(type, ns); } job->busy = true; } @@ -208,7 +209,7 @@ void block_job_sleep_ns(BlockJob *job, QEMUClock *clock, int64_t ns) BlockJobInfo *block_job_query(BlockJob *job) { BlockJobInfo *info = g_new0(BlockJobInfo, 1); - info->type = g_strdup(job->job_type->job_type); + info->type = g_strdup(BlockJobType_lookup[job->driver->job_type]); info->device = g_strdup(bdrv_get_device_name(job->bs)); info->len = job->len; info->busy = job->busy; @@ -235,7 +236,7 @@ QObject *qobject_from_block_job(BlockJob *job) "'len': %" PRId64 "," "'offset': %" PRId64 "," "'speed': %" PRId64 " }", - job->job_type->job_type, + BlockJobType_lookup[job->driver->job_type], bdrv_get_device_name(job->bs), job->len, job->offset, diff --git a/bsd-user/main.c b/bsd-user/main.c index f9246aa102..4ba61da896 100644 --- a/bsd-user/main.c +++ b/bsd-user/main.c @@ -43,7 +43,7 @@ unsigned long reserved_va; #endif static const char *interp_prefix = CONFIG_QEMU_INTERP_PREFIX; -const char *qemu_uname_release = CONFIG_UNAME_RELEASE; +const char *qemu_uname_release; extern char **environ; enum BSDType bsd_type; @@ -1000,11 +1000,9 @@ int main(int argc, char **argv) memset(ts, 0, sizeof(TaskState)); init_task_state(ts); ts->info = info; - env->opaque = ts; + cpu->opaque = ts; #if defined(TARGET_I386) - cpu_x86_set_cpl(env, 3); - env->cr[0] = CR0_PG_MASK | CR0_WP_MASK | CR0_PE_MASK; env->hflags |= HF_PE_MASK; if (env->features[FEAT_1_EDX] & CPUID_SSE) { diff --git a/bsd-user/qemu.h b/bsd-user/qemu.h index 325f564f80..ddc74ed0d7 100644 --- a/bsd-user/qemu.h +++ b/bsd-user/qemu.h @@ -323,9 +323,9 @@ abi_long copy_from_user(void *hptr, abi_ulong gaddr, size_t len); abi_long copy_to_user(abi_ulong gaddr, void *hptr, size_t len); /* Functions for accessing guest memory. The tget and tput functions - read/write single values, byteswapping as necessary. The lock_user + read/write single values, byteswapping as necessary. The lock_user function gets a pointer to a contiguous area of guest memory, but does not perform - and byteswapping. lock_user may return either a pointer to the guest + any byteswapping. lock_user may return either a pointer to the guest memory, or a temporary buffer. */ /* Lock an area of guest memory into the host. If copy is true then the @@ -381,7 +381,7 @@ static inline void *lock_user_string(abi_ulong guest_addr) return lock_user(VERIFY_READ, guest_addr, (long)(len + 1), 1); } -/* Helper macros for locking/ulocking a target struct. */ +/* Helper macros for locking/unlocking a target struct. */ #define lock_user_struct(type, host_ptr, guest_addr, copy) \ (host_ptr = lock_user(type, guest_addr, sizeof(*host_ptr), copy)) #define unlock_user_struct(host_ptr, guest_addr, copy) \ diff --git a/configure b/configure index 18fa60824b..154a7378d6 100755 --- a/configure +++ b/configure @@ -12,12 +12,16 @@ else fi TMPC="${TMPDIR1}/qemu-conf-${RANDOM}-$$-${RANDOM}.c" -TMPO="${TMPDIR1}/qemu-conf-${RANDOM}-$$-${RANDOM}.o" +TMPB="qemu-conf-${RANDOM}-$$-${RANDOM}" +TMPO="${TMPDIR1}/${TMPB}.o" +TMPCXX="${TMPDIR1}/${TMPB}.cxx" +TMPL="${TMPDIR1}/${TMPB}.lo" +TMPA="${TMPDIR1}/lib${TMPB}.la" TMPE="${TMPDIR1}/qemu-conf-${RANDOM}-$$-${RANDOM}.exe" # NB: do not call "exit" in the trap handler; this is buggy with some shells; # see <1285349658-3122-1-git-send-email-loic.minier@linaro.org> -trap "rm -f $TMPC $TMPO $TMPE" EXIT INT QUIT TERM +trap "rm -f $TMPC $TMPO $TMPCXX $TMPE" EXIT INT QUIT TERM rm -f config.log # Print a helpful header at the top of config.log @@ -38,10 +42,13 @@ error_exit() { exit 1 } -do_cc() { - # Run the compiler, capturing its output to the log. - echo $cc "$@" >> config.log - $cc "$@" >> config.log 2>&1 || return $? +do_compiler() { + # Run the compiler, capturing its output to the log. First argument + # is compiler binary to execute. + local compiler="$1" + shift + echo $compiler "$@" >> config.log + $compiler "$@" >> config.log 2>&1 || return $? # Test passed. If this is an --enable-werror build, rerun # the test with -Werror and bail out if it fails. This # makes warning-generating-errors in configure test code @@ -55,14 +62,39 @@ do_cc() { return 0 ;; esac - echo $cc -Werror "$@" >> config.log - $cc -Werror "$@" >> config.log 2>&1 && return $? + echo $compiler -Werror "$@" >> config.log + $compiler -Werror "$@" >> config.log 2>&1 && return $? error_exit "configure test passed without -Werror but failed with -Werror." \ "This is probably a bug in the configure script. The failing command" \ "will be at the bottom of config.log." \ "You can run configure with --disable-werror to bypass this check." } +do_cc() { + do_compiler "$cc" "$@" +} + +do_cxx() { + do_compiler "$cxx" "$@" +} + +update_cxxflags() { + # Set QEMU_CXXFLAGS from QEMU_CFLAGS by filtering out those + # options which some versions of GCC's C++ compiler complain about + # because they only make sense for C programs. + QEMU_CXXFLAGS= + for arg in $QEMU_CFLAGS; do + case $arg in + -Wstrict-prototypes|-Wmissing-prototypes|-Wnested-externs|\ + -Wold-style-declaration|-Wold-style-definition|-Wredundant-decls) + ;; + *) + QEMU_CXXFLAGS=${QEMU_CXXFLAGS:+$QEMU_CXXFLAGS }$arg + ;; + esac + done +} + compile_object() { do_cc $QEMU_CFLAGS -c -o $TMPO $TMPC } @@ -73,6 +105,38 @@ compile_prog() { do_cc $QEMU_CFLAGS $local_cflags -o $TMPE $TMPC $LDFLAGS $local_ldflags } +do_libtool() { + local mode=$1 + shift + # Run the compiler, capturing its output to the log. + echo $libtool $mode --tag=CC $cc "$@" >> config.log + $libtool $mode --tag=CC $cc "$@" >> config.log 2>&1 || return $? + # Test passed. If this is an --enable-werror build, rerun + # the test with -Werror and bail out if it fails. This + # makes warning-generating-errors in configure test code + # obvious to developers. + if test "$werror" != "yes"; then + return 0 + fi + # Don't bother rerunning the compile if we were already using -Werror + case "$*" in + *-Werror*) + return 0 + ;; + esac + echo $libtool $mode --tag=CC $cc -Werror "$@" >> config.log + $libtool $mode --tag=CC $cc -Werror "$@" >> config.log 2>&1 && return $? + error_exit "configure test passed without -Werror but failed with -Werror." \ + "This is probably a bug in the configure script. The failing command" \ + "will be at the bottom of config.log." \ + "You can run configure with --disable-werror to bypass this check." +} + +libtool_prog() { + do_libtool --mode=compile $QEMU_CFLAGS -c -fPIE -DPIE -o $TMPO $TMPC || return $? + do_libtool --mode=link $LDFLAGS -o $TMPA $TMPL -rpath /usr/local/lib +} + # symbolically link $1 to $2. Portable version of "ln -sf". symlink() { rm -rf "$2" @@ -119,6 +183,7 @@ path_of() { # default parameters source_path=`dirname "$0"` cpu="" +iasl="iasl" interp_prefix="/usr/gnemul/qemu-%M" static="no" cross_prefix="" @@ -133,6 +198,7 @@ audio_win_int="" cc_i386=i386-pc-linux-gnu-gcc libs_qga="" debug_info="yes" +stack_protector="" # Don't accept a target_list environment variable. unset target_list @@ -155,8 +221,10 @@ curl="" curses="" docs="" fdt="" +netmap="no" pixman="" sdl="" +sdlabi="1.2" virtfs="" vnc="yes" sparse="no" @@ -190,6 +258,9 @@ mingw32="no" gcov="no" gcov_tool="gcov" EXESUF="" +DSOSUF=".so" +LDFLAGS_SHARED="-shared" +modules="no" prefix="/usr/local" mandir="\${prefix}/share/man" datadir="\${prefix}/share" @@ -214,8 +285,6 @@ softmmu="yes" linux_user="no" bsd_user="no" guest_base="yes" -uname_release="" -mixemu="no" aix="no" blobs="yes" pkgversion="" @@ -231,18 +300,30 @@ libusb="" usb_redir="" glx="" zlib="yes" +lzo="no" +snappy="no" guest_agent="" +guest_agent_with_vss="no" +vss_win32_sdk="" +win_sdk="no" want_tools="yes" libiscsi="" +libnfs="" coroutine="" +coroutine_pool="" seccomp="" glusterfs="" glusterfs_discard="no" +glusterfs_zerofill="no" virtio_blk_data_plane="" gtk="" gtkabi="2.0" +vte="" tpm="no" libssh2="" +vhdx="" +quorum="no" +numa="" # parse CC options first for opt do @@ -252,6 +333,8 @@ for opt do ;; --cc=*) CC="$optarg" ;; + --cxx=*) CXX="$optarg" + ;; --source-path=*) source_path="$optarg" ;; --cpu=*) cpu="$optarg" @@ -282,6 +365,12 @@ else cc="${CC-${cross_prefix}gcc}" fi +if test -z "${CXX}${cross_prefix}"; then + cxx="c++" +else + cxx="${CXX-${cross_prefix}g++}" +fi + ar="${AR-${cross_prefix}ar}" as="${AS-${cross_prefix}as}" cpp="${CPP-$cc -E}" @@ -296,9 +385,13 @@ query_pkg_config() { } pkg_config=query_pkg_config sdl_config="${SDL_CONFIG-${cross_prefix}sdl-config}" +sdl2_config="${SDL2_CONFIG-${cross_prefix}sdl2-config}" + +# If the user hasn't specified ARFLAGS, default to 'rv', just as make does. +ARFLAGS="${ARFLAGS-rv}" # default flags for all hosts -QEMU_CFLAGS="-fno-strict-aliasing $QEMU_CFLAGS" +QEMU_CFLAGS="-fno-strict-aliasing -fno-common $QEMU_CFLAGS" QEMU_CFLAGS="-Wall -Wundef -Wwrite-strings -Wmissing-prototypes $QEMU_CFLAGS" QEMU_CFLAGS="-Wstrict-prototypes -Wredundant-decls $QEMU_CFLAGS" QEMU_CFLAGS="-D_GNU_SOURCE -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE $QEMU_CFLAGS" @@ -311,6 +404,14 @@ fi # make source path absolute source_path=`cd "$source_path"; pwd` +# running configure in the source tree? +# we know that's the case if configure is there. +if test -f "./configure"; then + pwd_is_source_path="y" +else + pwd_is_source_path="n" +fi + check_define() { cat > $TMPC <= (3,))'; then + error_exit "Cannot use '$python', Python 2.4 or later is required." \ + "Note that Python 3 or later is not yet supported." \ + "Use --python=/path/to/python to specify a supported Python." +fi + +# The -B switch was added in Python 2.6. +# If it is supplied, compiled files are not written. +# Use it for Python versions which support it. +if $python -B -c 'import sys; sys.exit(0)' 2>/dev/null; then + python="$python -B" +fi + case "$cpu" in + ppc) + CPU_CFLAGS="-m32" + LDFLAGS="-m32 $LDFLAGS" + ;; + ppc64) + CPU_CFLAGS="-m64" + LDFLAGS="-m64 $LDFLAGS" + ;; sparc) LDFLAGS="-m32 $LDFLAGS" CPU_CFLAGS="-m32 -mcpu=ultrasparc" @@ -960,11 +1142,11 @@ case "$cpu" in CPU_CFLAGS="-m64 -mcpu=ultrasparc" ;; s390) - CPU_CFLAGS="-m31 -march=z990" + CPU_CFLAGS="-m31" LDFLAGS="-m31 $LDFLAGS" ;; s390x) - CPU_CFLAGS="-m64 -march=z990" + CPU_CFLAGS="-m64" LDFLAGS="-m64 $LDFLAGS" ;; i386) @@ -977,6 +1159,11 @@ case "$cpu" in LDFLAGS="-m64 $LDFLAGS" cc_i386='$(CC) -m32' ;; + x32) + CPU_CFLAGS="-mx32" + LDFLAGS="-mx32 $LDFLAGS" + cc_i386='$(CC) -m32' + ;; # No special flags required for other host CPUs esac @@ -1007,161 +1194,181 @@ cat << EOF Usage: configure [options] Options: [defaults in brackets after descriptions] +Standard options: + --help print this message + --prefix=PREFIX install in PREFIX [$prefix] + --interp-prefix=PREFIX where to find shared libraries, etc. + use %M for cpu name [$interp_prefix] + --target-list=LIST set target list (default: build everything) +$(echo Available targets: $default_target_list | \ + fold -s -w 53 | sed -e 's/^/ /') + +Advanced options (experts only): + --source-path=PATH path of source code [$source_path] + --cross-prefix=PREFIX use PREFIX for compile tools [$cross_prefix] + --cc=CC use C compiler CC [$cc] + --iasl=IASL use ACPI compiler IASL [$iasl] + --host-cc=CC use C compiler CC [$host_cc] for code run at + build time + --cxx=CXX use C++ compiler CXX [$cxx] + --objcc=OBJCC use Objective-C compiler OBJCC [$objcc] + --extra-cflags=CFLAGS append extra C compiler flags QEMU_CFLAGS + --extra-ldflags=LDFLAGS append extra linker flags LDFLAGS + --make=MAKE use specified make [$make] + --install=INSTALL use specified install [$install] + --python=PYTHON use specified python [$python] + --smbd=SMBD use specified smbd [$smbd] + --static enable static build [$static] + --mandir=PATH install man pages in PATH + --datadir=PATH install firmware in PATH$confsuffix + --docdir=PATH install documentation in PATH$confsuffix + --bindir=PATH install binaries in PATH + --libdir=PATH install libraries in PATH + --sysconfdir=PATH install config in PATH$confsuffix + --localstatedir=PATH install local state in PATH (set at runtime on win32) + --with-confsuffix=SUFFIX suffix for QEMU data inside datadir/libdir/sysconfdir [$confsuffix] + --enable-modules enable modules support + --enable-debug-tcg enable TCG debugging + --disable-debug-tcg disable TCG debugging (default) + --enable-debug-info enable debugging information (default) + --disable-debug-info disable debugging information + --enable-debug enable common debug build options + --enable-sparse enable sparse checker + --disable-sparse disable sparse checker (default) + --disable-strip disable stripping binaries + --disable-werror disable compilation abort on warning + --disable-stack-protector disable compiler-provided stack protection + --disable-sdl disable SDL + --enable-sdl enable SDL + --with-sdlabi select preferred SDL ABI 1.2 or 2.0 + --disable-gtk disable gtk UI + --enable-gtk enable gtk UI + --with-gtkabi select preferred GTK ABI 2.0 or 3.0 + --disable-virtfs disable VirtFS + --enable-virtfs enable VirtFS + --disable-vnc disable VNC + --enable-vnc enable VNC + --disable-cocoa disable Cocoa (Mac OS X only) + --enable-cocoa enable Cocoa (default on Mac OS X) + --audio-drv-list=LIST set audio drivers list: + Available drivers: $audio_possible_drivers + --block-drv-whitelist=L Same as --block-drv-rw-whitelist=L + --block-drv-rw-whitelist=L + set block driver read-write whitelist + (affects only QEMU, not qemu-img) + --block-drv-ro-whitelist=L + set block driver read-only whitelist + (affects only QEMU, not qemu-img) + --disable-xen disable xen backend driver support + --enable-xen enable xen backend driver support + --disable-xen-pci-passthrough + --enable-xen-pci-passthrough + --disable-brlapi disable BrlAPI + --enable-brlapi enable BrlAPI + --disable-vnc-tls disable TLS encryption for VNC server + --enable-vnc-tls enable TLS encryption for VNC server + --disable-vnc-sasl disable SASL encryption for VNC server + --enable-vnc-sasl enable SASL encryption for VNC server + --disable-vnc-jpeg disable JPEG lossy compression for VNC server + --enable-vnc-jpeg enable JPEG lossy compression for VNC server + --disable-vnc-png disable PNG compression for VNC server (default) + --enable-vnc-png enable PNG compression for VNC server + --disable-vnc-ws disable Websockets support for VNC server + --enable-vnc-ws enable Websockets support for VNC server + --disable-curses disable curses output + --enable-curses enable curses output + --disable-curl disable curl connectivity + --enable-curl enable curl connectivity + --disable-fdt disable fdt device tree + --enable-fdt enable fdt device tree + --disable-bluez disable bluez stack connectivity + --enable-bluez enable bluez stack connectivity + --disable-slirp disable SLIRP userspace network connectivity + --disable-kvm disable KVM acceleration support + --enable-kvm enable KVM acceleration support + --disable-rdma disable RDMA-based migration support + --enable-rdma enable RDMA-based migration support + --enable-tcg-interpreter enable TCG with bytecode interpreter (TCI) + --enable-system enable all system emulation targets + --disable-system disable all system emulation targets + --enable-user enable supported user emulation targets + --disable-user disable all user emulation targets + --enable-linux-user enable all linux usermode emulation targets + --disable-linux-user disable all linux usermode emulation targets + --enable-bsd-user enable all BSD usermode emulation targets + --disable-bsd-user disable all BSD usermode emulation targets + --enable-guest-base enable GUEST_BASE support for usermode + emulation targets + --disable-guest-base disable GUEST_BASE support + --enable-pie build Position Independent Executables + --disable-pie do not build Position Independent Executables + --fmod-lib path to FMOD library + --fmod-inc path to FMOD includes + --oss-lib path to OSS library + --cpu=CPU Build for host CPU [$cpu] + --disable-uuid disable uuid support + --enable-uuid enable uuid support + --disable-vde disable support for vde network + --enable-vde enable support for vde network + --disable-netmap disable support for netmap network + --enable-netmap enable support for netmap network + --disable-linux-aio disable Linux AIO support + --enable-linux-aio enable Linux AIO support + --disable-cap-ng disable libcap-ng support + --enable-cap-ng enable libcap-ng support + --disable-attr disables attr and xattr support + --enable-attr enable attr and xattr support + --disable-blobs disable installing provided firmware blobs + --enable-docs enable documentation build + --disable-docs disable documentation build + --disable-vhost-net disable vhost-net acceleration support + --enable-vhost-net enable vhost-net acceleration support + --enable-trace-backend=B Set trace backend + Available backends: $($python $source_path/scripts/tracetool.py --list-backends) + --with-trace-file=NAME Full PATH,NAME of file to store traces + Default:trace- + --disable-spice disable spice + --enable-spice enable spice + --enable-rbd enable building the rados block device (rbd) + --disable-libiscsi disable iscsi support + --enable-libiscsi enable iscsi support + --disable-libnfs disable nfs support + --enable-libnfs enable nfs support + --disable-smartcard-nss disable smartcard nss support + --enable-smartcard-nss enable smartcard nss support + --disable-libusb disable libusb (for usb passthrough) + --enable-libusb enable libusb (for usb passthrough) + --disable-usb-redir disable usb network redirection support + --enable-usb-redir enable usb network redirection support + --enable-lzo enable the support of lzo compression library + --enable-snappy enable the support of snappy compression library + --disable-guest-agent disable building of the QEMU Guest Agent + --enable-guest-agent enable building of the QEMU Guest Agent + --with-vss-sdk=SDK-path enable Windows VSS support in QEMU Guest Agent + --with-win-sdk=SDK-path path to Windows Platform SDK (to build VSS .tlb) + --disable-seccomp disable seccomp support + --enable-seccomp enables seccomp support + --with-coroutine=BACKEND coroutine backend. Supported options: + gthread, ucontext, sigaltstack, windows + --disable-coroutine-pool disable coroutine freelist (worse performance) + --enable-coroutine-pool enable coroutine freelist (better performance) + --enable-glusterfs enable GlusterFS backend + --disable-glusterfs disable GlusterFS backend + --enable-gcov enable test coverage analysis with gcov + --gcov=GCOV use specified gcov [$gcov_tool] + --enable-tpm enable TPM support + --disable-libssh2 disable ssh block device support + --enable-libssh2 enable ssh block device support + --disable-vhdx disables support for the Microsoft VHDX image format + --enable-vhdx enable support for the Microsoft VHDX image format + --disable-quorum disable quorum block filter support + --enable-quorum enable quorum block filter support + --disable-numa disable libnuma support + --enable-numa enable libnuma support + +NOTE: The object files are built at the place where configure is launched EOF -echo "Standard options:" -echo " --help print this message" -echo " --prefix=PREFIX install in PREFIX [$prefix]" -echo " --interp-prefix=PREFIX where to find shared libraries, etc." -echo " use %M for cpu name [$interp_prefix]" -echo " --target-list=LIST set target list (default: build everything)" -echo "Available targets: $default_target_list" | \ - fold -s -w 53 | sed -e 's/^/ /' -echo "" -echo "Advanced options (experts only):" -echo " --source-path=PATH path of source code [$source_path]" -echo " --cross-prefix=PREFIX use PREFIX for compile tools [$cross_prefix]" -echo " --cc=CC use C compiler CC [$cc]" -echo " --host-cc=CC use C compiler CC [$host_cc] for code run at" -echo " build time" -echo " --objcc=OBJCC use Objective-C compiler OBJCC [$objcc]" -echo " --extra-cflags=CFLAGS append extra C compiler flags QEMU_CFLAGS" -echo " --extra-ldflags=LDFLAGS append extra linker flags LDFLAGS" -echo " --make=MAKE use specified make [$make]" -echo " --install=INSTALL use specified install [$install]" -echo " --python=PYTHON use specified python [$python]" -echo " --smbd=SMBD use specified smbd [$smbd]" -echo " --static enable static build [$static]" -echo " --mandir=PATH install man pages in PATH" -echo " --datadir=PATH install firmware in PATH$confsuffix" -echo " --docdir=PATH install documentation in PATH$confsuffix" -echo " --bindir=PATH install binaries in PATH" -echo " --libdir=PATH install libraries in PATH" -echo " --sysconfdir=PATH install config in PATH$confsuffix" -echo " --localstatedir=PATH install local state in PATH (set at runtime on win32)" -echo " --with-confsuffix=SUFFIX suffix for QEMU data inside datadir and sysconfdir [$confsuffix]" -echo " --enable-debug-tcg enable TCG debugging" -echo " --disable-debug-tcg disable TCG debugging (default)" -echo " --enable-debug-info enable debugging information (default)" -echo " --disable-debug-info disable debugging information" -echo " --enable-debug enable common debug build options" -echo " --enable-sparse enable sparse checker" -echo " --disable-sparse disable sparse checker (default)" -echo " --disable-strip disable stripping binaries" -echo " --disable-werror disable compilation abort on warning" -echo " --disable-sdl disable SDL" -echo " --enable-sdl enable SDL" -echo " --disable-gtk disable gtk UI" -echo " --enable-gtk enable gtk UI" -echo " --disable-virtfs disable VirtFS" -echo " --enable-virtfs enable VirtFS" -echo " --disable-vnc disable VNC" -echo " --enable-vnc enable VNC" -echo " --disable-cocoa disable Cocoa (Mac OS X only)" -echo " --enable-cocoa enable Cocoa (default on Mac OS X)" -echo " --audio-drv-list=LIST set audio drivers list:" -echo " Available drivers: $audio_possible_drivers" -echo " --block-drv-whitelist=L Same as --block-drv-rw-whitelist=L" -echo " --block-drv-rw-whitelist=L" -echo " set block driver read-write whitelist" -echo " (affects only QEMU, not qemu-img)" -echo " --block-drv-ro-whitelist=L" -echo " set block driver read-only whitelist" -echo " (affects only QEMU, not qemu-img)" -echo " --enable-mixemu enable mixer emulation" -echo " --disable-xen disable xen backend driver support" -echo " --enable-xen enable xen backend driver support" -echo " --disable-xen-pci-passthrough" -echo " --enable-xen-pci-passthrough" -echo " --disable-brlapi disable BrlAPI" -echo " --enable-brlapi enable BrlAPI" -echo " --disable-vnc-tls disable TLS encryption for VNC server" -echo " --enable-vnc-tls enable TLS encryption for VNC server" -echo " --disable-vnc-sasl disable SASL encryption for VNC server" -echo " --enable-vnc-sasl enable SASL encryption for VNC server" -echo " --disable-vnc-jpeg disable JPEG lossy compression for VNC server" -echo " --enable-vnc-jpeg enable JPEG lossy compression for VNC server" -echo " --disable-vnc-png disable PNG compression for VNC server (default)" -echo " --enable-vnc-png enable PNG compression for VNC server" -echo " --disable-vnc-ws disable Websockets support for VNC server" -echo " --enable-vnc-ws enable Websockets support for VNC server" -echo " --disable-curses disable curses output" -echo " --enable-curses enable curses output" -echo " --disable-curl disable curl connectivity" -echo " --enable-curl enable curl connectivity" -echo " --disable-fdt disable fdt device tree" -echo " --enable-fdt enable fdt device tree" -echo " --disable-bluez disable bluez stack connectivity" -echo " --enable-bluez enable bluez stack connectivity" -echo " --disable-slirp disable SLIRP userspace network connectivity" -echo " --disable-kvm disable KVM acceleration support" -echo " --enable-kvm enable KVM acceleration support" -echo " --disable-rdma disable RDMA-based migration support" -echo " --enable-rdma enable RDMA-based migration support" -echo " --enable-tcg-interpreter enable TCG with bytecode interpreter (TCI)" -echo " --enable-system enable all system emulation targets" -echo " --disable-system disable all system emulation targets" -echo " --enable-user enable supported user emulation targets" -echo " --disable-user disable all user emulation targets" -echo " --enable-linux-user enable all linux usermode emulation targets" -echo " --disable-linux-user disable all linux usermode emulation targets" -echo " --enable-bsd-user enable all BSD usermode emulation targets" -echo " --disable-bsd-user disable all BSD usermode emulation targets" -echo " --enable-guest-base enable GUEST_BASE support for usermode" -echo " emulation targets" -echo " --disable-guest-base disable GUEST_BASE support" -echo " --enable-pie build Position Independent Executables" -echo " --disable-pie do not build Position Independent Executables" -echo " --fmod-lib path to FMOD library" -echo " --fmod-inc path to FMOD includes" -echo " --oss-lib path to OSS library" -echo " --enable-uname-release=R Return R for uname -r in usermode emulation" -echo " --cpu=CPU Build for host CPU [$cpu]" -echo " --disable-uuid disable uuid support" -echo " --enable-uuid enable uuid support" -echo " --disable-vde disable support for vde network" -echo " --enable-vde enable support for vde network" -echo " --disable-linux-aio disable Linux AIO support" -echo " --enable-linux-aio enable Linux AIO support" -echo " --disable-cap-ng disable libcap-ng support" -echo " --enable-cap-ng enable libcap-ng support" -echo " --disable-attr disables attr and xattr support" -echo " --enable-attr enable attr and xattr support" -echo " --disable-blobs disable installing provided firmware blobs" -echo " --enable-docs enable documentation build" -echo " --disable-docs disable documentation build" -echo " --disable-vhost-net disable vhost-net acceleration support" -echo " --enable-vhost-net enable vhost-net acceleration support" -echo " --enable-trace-backend=B Set trace backend" -echo " Available backends:" $($python "$source_path"/scripts/tracetool.py --list-backends) -echo " --with-trace-file=NAME Full PATH,NAME of file to store traces" -echo " Default:trace-" -echo " --disable-spice disable spice" -echo " --enable-spice enable spice" -echo " --enable-rbd enable building the rados block device (rbd)" -echo " --disable-libiscsi disable iscsi support" -echo " --enable-libiscsi enable iscsi support" -echo " --disable-smartcard-nss disable smartcard nss support" -echo " --enable-smartcard-nss enable smartcard nss support" -echo " --disable-libusb disable libusb (for usb passthrough)" -echo " --enable-libusb enable libusb (for usb passthrough)" -echo " --disable-usb-redir disable usb network redirection support" -echo " --enable-usb-redir enable usb network redirection support" -echo " --disable-guest-agent disable building of the QEMU Guest Agent" -echo " --enable-guest-agent enable building of the QEMU Guest Agent" -echo " --disable-seccomp disable seccomp support" -echo " --enable-seccomp enables seccomp support" -echo " --with-coroutine=BACKEND coroutine backend. Supported options:" -echo " gthread, ucontext, sigaltstack, windows" -echo " --enable-glusterfs enable GlusterFS backend" -echo " --disable-glusterfs disable GlusterFS backend" -echo " --enable-gcov enable test coverage analysis with gcov" -echo " --gcov=GCOV use specified gcov [$gcov_tool]" -echo " --enable-tpm enable TPM support" -echo " --disable-libssh2 disable ssh block device support" -echo " --enable-libssh2 enable ssh block device support" -echo "" -echo "NOTE: The object files are built at the place where configure is launched" -exit 1 +exit 0 fi # Now we have handled --enable-tcg-interpreter and know we're not just @@ -1175,6 +1382,19 @@ if test "$ARCH" = "unknown"; then fi fi +# Consult white-list to determine whether to enable werror +# by default. Only enable by default for git builds +z_version=`cut -f3 -d. $source_path/VERSION` + +if test -z "$werror" ; then + if test -d "$source_path/.git" -a \ + "$linux" = "yes" ; then + werror="yes" + else + werror="no" + fi +fi + # check that the C compiler works. cat > $TMPC < $TMPC < $TMPCXX <.o compile and some only at link time + if do_cc $QEMU_CFLAGS -Werror $flag -c -o $TMPO $TMPC && + compile_prog "-Werror $flag" ""; then + QEMU_CFLAGS="$QEMU_CFLAGS $flag" + LIBTOOLFLAGS="$LIBTOOLFLAGS -Wc,$flag" + break + fi + done fi # Workaround for http://gcc.gnu.org/PR55489. Happens with -fPIE/-fPIC and @@ -1242,6 +1490,9 @@ if compile_prog "-Werror -fno-gcse" "" ; then fi if test "$static" = "yes" ; then + if test "$modules" = "yes" ; then + error_exit "static and modules are mutually incompatible" + fi if test "$pie" = "yes" ; then error_exit "static and pie are mutually incompatible" else @@ -1251,7 +1502,7 @@ fi if test "$pie" = ""; then case "$cpu-$targetos" in - i386-Linux|x86_64-Linux|i386-OpenBSD|x86_64-OpenBSD) + i386-Linux|x86_64-Linux|x32-Linux|i386-OpenBSD|x86_64-OpenBSD) ;; *) pie="no" @@ -1288,6 +1539,37 @@ EOF pie="no" fi fi + + if compile_prog "-fno-pie" "-nopie"; then + CFLAGS_NOPIE="-fno-pie" + LDFLAGS_NOPIE="-nopie" + fi +fi + +# check for broken gcc and libtool in RHEL5 +if test -n "$libtool" -a "$pie" != "no" ; then + cat > $TMPC <= (3,))'; then - error_exit "Cannot use '$python', Python 2.4 or later is required." \ - "Note that Python 3 or later is not yet supported." \ - "Use --python=/path/to/python to specify a supported Python." -fi - if test -z "${target_list+xxx}" ; then target_list="$default_target_list" else @@ -1382,44 +1652,34 @@ esac feature_not_found() { feature=$1 + remedy=$2 error_exit "User requested feature $feature" \ - "configure was not able to find it" + "configure was not able to find it." \ + "$remedy" } -if test -z "$cross_prefix" ; then - # --- # big/little endian test cat > $TMPC << EOF -#include -int main(void) { - volatile uint32_t i=0x01234567; - return (*((uint8_t*)(&i))) == 0x67; +short big_endian[] = { 0x4269, 0x4765, 0x4e64, 0x4961, 0x4e00, 0, }; +short little_endian[] = { 0x694c, 0x7454, 0x654c, 0x6e45, 0x6944, 0x6e41, 0, }; +extern int foo(short *, short *); +int main(int argc, char *argv[]) { + return foo(big_endian, little_endian); } EOF -if compile_prog "" "" ; then -$TMPE && bigendian="yes" -else -echo big/little test failed -fi - -else - -# if cross compiling, cannot launch a program, so make a static guess -case "$cpu" in - arm) - # ARM can be either way; ask the compiler which one we are - if check_define __ARMEB__; then - bigendian=yes +if compile_object ; then + if grep -q BiGeNdIaN $TMPO ; then + bigendian="yes" + elif grep -q LiTtLeEnDiAn $TMPO ; then + bigendian="no" + else + echo big/little test failed fi - ;; - hppa|m68k|mips|mips64|ppc|ppc64|s390|s390x|sparc|sparc64) - bigendian=yes - ;; -esac - +else + echo big/little test failed fi ########################################## @@ -1444,7 +1704,7 @@ int main(void) { } EOF if ! compile_object ; then - feature_not_found "nptl" + feature_not_found "nptl" "Install glibc and linux kernel headers." fi fi @@ -1463,19 +1723,55 @@ EOF "Make sure to have the zlib libs and headers installed." fi fi -libs_softmmu="$libs_softmmu -lz" +LIBS="$LIBS -lz" + +########################################## +# lzo check + +if test "$lzo" != "no" ; then + cat > $TMPC << EOF +#include +int main(void) { lzo_version(); return 0; } +EOF + if compile_prog "" "-llzo2" ; then + : + else + error_exit "lzo check failed" \ + "Make sure to have the lzo libs and headers installed." + fi + + libs_softmmu="$libs_softmmu -llzo2" +fi + +########################################## +# snappy check + +if test "$snappy" != "no" ; then + cat > $TMPC << EOF +#include +int main(void) { snappy_max_compressed_length(4096); return 0; } +EOF + if compile_prog "" "-lsnappy" ; then + : + else + error_exit "snappy check failed" \ + "Make sure to have the snappy libs and headers installed." + fi + + libs_softmmu="$libs_softmmu -lsnappy" +fi ########################################## # libseccomp check if test "$seccomp" != "no" ; then - if $pkg_config --atleast-version=2.1.0 libseccomp --modversion >/dev/null 2>&1; then + if $pkg_config --atleast-version=2.1.0 libseccomp; then libs_softmmu="$libs_softmmu `$pkg_config --libs libseccomp`" QEMU_CFLAGS="$QEMU_CFLAGS `$pkg_config --cflags libseccomp`" seccomp="yes" else if test "$seccomp" = "yes"; then - feature_not_found "libseccomp" + feature_not_found "libseccomp" "Install libseccomp devel >= 2.1.0" fi seccomp="no" fi @@ -1500,7 +1796,7 @@ EOF if ! compile_prog "" "$xen_libs" ; then # Xen not found if test "$xen" = "yes" ; then - feature_not_found "xen" + feature_not_found "xen" "Install xen devel" fi xen=no @@ -1623,7 +1919,7 @@ EOF # Xen version unsupported else if test "$xen" = "yes" ; then - feature_not_found "xen (unsupported version)" + feature_not_found "xen (unsupported version)" "Install supported xen (e.g. 4.0, 3.4, 3.3)" fi xen=no fi @@ -1672,7 +1968,7 @@ if test "$sparse" != "no" ; then sparse=yes else if test "$sparse" = "yes" ; then - feature_not_found "sparse" + feature_not_found "sparse" "Install sparse binary" fi sparse=no fi @@ -1685,30 +1981,41 @@ if test "$gtk" != "no"; then gtkpackage="gtk+-$gtkabi" if test "$gtkabi" = "3.0" ; then gtkversion="3.0.0" + else + gtkversion="2.18.0" + fi + if $pkg_config --exists "$gtkpackage >= $gtkversion"; then + gtk_cflags=`$pkg_config --cflags $gtkpackage` + gtk_libs=`$pkg_config --libs $gtkpackage` + libs_softmmu="$gtk_libs $libs_softmmu" + gtk="yes" + elif test "$gtk" = "yes"; then + feature_not_found "gtk" "Install gtk2 or gtk3 (requires --with-gtkabi=3.0 option to configure) devel" + else + gtk="no" + fi +fi + +########################################## +# VTE probe + +if test "$vte" != "no"; then + if test "$gtkabi" = "3.0"; then vtepackage="vte-2.90" vteversion="0.32.0" else - gtkversion="2.18.0" vtepackage="vte" vteversion="0.24.0" fi - if ! $pkg_config --exists "$gtkpackage >= $gtkversion"; then - if test "$gtk" = "yes" ; then - feature_not_found "gtk" - fi - gtk="no" - elif ! $pkg_config --exists "$vtepackage >= $vteversion"; then - if test "$gtk" = "yes" ; then - error_exit "libvte not found (required for gtk support)" - fi - gtk="no" + if $pkg_config --exists "$vtepackage >= $vteversion"; then + vte_cflags=`$pkg_config --cflags $vtepackage` + vte_libs=`$pkg_config --libs $vtepackage` + libs_softmmu="$vte_libs $libs_softmmu" + vte="yes" + elif test "$vte" = "yes"; then + feature_not_found "vte" "Install libvte or libvte-2.90 (requires --with-gtkabi=3.0 option to configure) devel" else - gtk_cflags=`$pkg_config --cflags $gtkpackage 2>/dev/null` - gtk_libs=`$pkg_config --libs $gtkpackage 2>/dev/null` - vte_cflags=`$pkg_config --cflags $vtepackage 2>/dev/null` - vte_libs=`$pkg_config --libs $vtepackage 2>/dev/null` - libs_softmmu="$gtk_libs $vte_libs $libs_softmmu" - gtk="yes" + vte="no" fi fi @@ -1717,19 +2024,29 @@ fi # Look for sdl configuration program (pkg-config or sdl-config). Try # sdl-config even without cross prefix, and favour pkg-config over sdl-config. -if test "`basename $sdl_config`" != sdl-config && ! has ${sdl_config}; then - sdl_config=sdl-config + +if test $sdlabi = "2.0"; then + sdl_config=$sdl2_config + sdlname=sdl2 + sdlconfigname=sdl2_config +else + sdlname=sdl + sdlconfigname=sdl_config +fi + +if test "`basename $sdl_config`" != $sdlconfigname && ! has ${sdl_config}; then + sdl_config=$sdlconfigname fi -if $pkg_config sdl --modversion >/dev/null 2>&1; then - sdlconfig="$pkg_config sdl" +if $pkg_config $sdlname --exists; then + sdlconfig="$pkg_config $sdlname" _sdlversion=`$sdlconfig --modversion 2>/dev/null | sed 's/[^0-9]//g'` elif has ${sdl_config}; then sdlconfig="$sdl_config" _sdlversion=`$sdlconfig --version | sed 's/[^0-9]//g'` else if test "$sdl" = "yes" ; then - feature_not_found "sdl" + feature_not_found "sdl" "Install SDL devel" fi sdl=no fi @@ -1773,7 +2090,7 @@ EOF fi # static link else # sdl not found if test "$sdl" = "yes" ; then - feature_not_found "sdl" + feature_not_found "sdl" "Install SDL devel" fi sdl=no fi # sdl compile test @@ -1839,16 +2156,40 @@ EOF QEMU_CFLAGS="$QEMU_CFLAGS $vnc_tls_cflags" else if test "$vnc_tls" = "yes" ; then - feature_not_found "vnc-tls" + feature_not_found "vnc-tls" "Install gnutls devel" fi if test "$vnc_ws" = "yes" ; then - feature_not_found "vnc-ws" + feature_not_found "vnc-ws" "Install gnutls devel" fi vnc_tls=no vnc_ws=no fi fi +########################################## +# Quorum probe (check for gnutls) +if test "$quorum" != "no" ; then +cat > $TMPC < +#include +int main(void) {char data[4096], digest[32]; +gnutls_hash_fast(GNUTLS_DIG_SHA256, data, 4096, digest); +return 0; +} +EOF +quorum_tls_cflags=`$pkg_config --cflags gnutls 2> /dev/null` +quorum_tls_libs=`$pkg_config --libs gnutls 2> /dev/null` +if compile_prog "$quorum_tls_cflags" "$quorum_tls_libs" ; then + qcow_tls=yes + libs_softmmu="$quorum_tls_libs $libs_softmmu" + libs_tools="$quorum_tls_libs $libs_softmmu" + QEMU_CFLAGS="$QEMU_CFLAGS $quorum_tls_cflags" +else + echo "gnutls > 2.10.0 required to compile Quorum" + exit 1 +fi +fi + ########################################## # VNC SASL detection if test "$vnc" = "yes" -a "$vnc_sasl" != "no" ; then @@ -1866,7 +2207,7 @@ EOF QEMU_CFLAGS="$QEMU_CFLAGS $vnc_sasl_cflags" else if test "$vnc_sasl" = "yes" ; then - feature_not_found "vnc-sasl" + feature_not_found "vnc-sasl" "Install Cyrus SASL devel" fi vnc_sasl=no fi @@ -1888,7 +2229,7 @@ EOF QEMU_CFLAGS="$QEMU_CFLAGS $vnc_jpeg_cflags" else if test "$vnc_jpeg" = "yes" ; then - feature_not_found "vnc-jpeg" + feature_not_found "vnc-jpeg" "Install libjpeg-turbo devel" fi vnc_jpeg=no fi @@ -1907,9 +2248,9 @@ int main(void) { return png_ptr != 0; } EOF - if $pkg_config libpng --modversion >/dev/null 2>&1; then - vnc_png_cflags=`$pkg_config libpng --cflags 2> /dev/null` - vnc_png_libs=`$pkg_config libpng --libs 2> /dev/null` + if $pkg_config libpng --exists; then + vnc_png_cflags=`$pkg_config libpng --cflags` + vnc_png_libs=`$pkg_config libpng --libs` else vnc_png_cflags="" vnc_png_libs="-lpng" @@ -1920,7 +2261,7 @@ EOF QEMU_CFLAGS="$QEMU_CFLAGS $vnc_png_cflags" else if test "$vnc_png" = "yes" ; then - feature_not_found "vnc-png" + feature_not_found "vnc-png" "Install libpng devel" fi vnc_png=no fi @@ -1964,12 +2305,24 @@ EOF libs_tools="$uuid_libs $libs_tools" else if test "$uuid" = "yes" ; then - feature_not_found "uuid" + feature_not_found "uuid" "Install libuuid devel" fi uuid=no fi fi +if test "$vhdx" = "yes" ; then + if test "$uuid" = "no" ; then + error_exit "uuid required for VHDX support" + fi +elif test "$vhdx" != "no" ; then + if test "$uuid" = "yes" ; then + vhdx=yes + else + vhdx=no + fi +fi + ########################################## # xfsctl() probe, used for raw-posix if test "$xfs" != "no" ; then @@ -1986,7 +2339,7 @@ EOF xfs="yes" else if test "$xfs" = "yes" ; then - feature_not_found "xfs" + feature_not_found "xfs" "Instal xfsprogs/xfslibs devel" fi xfs=no fi @@ -2012,12 +2365,40 @@ EOF libs_tools="$vde_libs $libs_tools" else if test "$vde" = "yes" ; then - feature_not_found "vde" + feature_not_found "vde" "Install vde (Virtual Distributed Ethernet) devel" fi vde=no fi fi +########################################## +# netmap support probe +# Apart from looking for netmap headers, we make sure that the host API version +# supports the netmap backend (>=11). The upper bound (15) is meant to simulate +# a minor/major version number. Minor new features will be marked with values up +# to 15, and if something happens that requires a change to the backend we will +# move above 15, submit the backend fixes and modify this two bounds. +if test "$netmap" != "no" ; then + cat > $TMPC << EOF +#include +#include +#include +#include +#if (NETMAP_API < 11) || (NETMAP_API > 15) +#error +#endif +int main(void) { return 0; } +EOF + if compile_prog "" "" ; then + netmap=yes + else + if test "$netmap" = "yes" ; then + feature_not_found "netmap" + fi + netmap=no + fi +fi + ########################################## # libcap-ng library probe if test "$cap_ng" != "no" ; then @@ -2035,7 +2416,7 @@ EOF libs_tools="$cap_libs $libs_tools" else if test "$cap_ng" = "yes" ; then - feature_not_found "cap_ng" + feature_not_found "cap_ng" "Install libcap-ng devel" fi cap_ng=no fi @@ -2140,7 +2521,7 @@ EOF libs_softmmu="$brlapi_libs $libs_softmmu" else if test "$brlapi" = "yes" ; then - feature_not_found "brlapi" + feature_not_found "brlapi" "Install brlapi devel" fi brlapi=no fi @@ -2177,7 +2558,7 @@ EOF curses=yes else if test "$curses" = "yes" ; then - feature_not_found "curses" + feature_not_found "curses" "Install ncurses devel" fi curses=no fi @@ -2186,7 +2567,7 @@ fi ########################################## # curl probe if test "$curl" != "no" ; then - if $pkg_config libcurl --modversion >/dev/null 2>&1; then + if $pkg_config libcurl --exists; then curlconfig="$pkg_config libcurl" else curlconfig=curl-config @@ -2199,11 +2580,9 @@ EOF curl_libs=`$curlconfig --libs 2>/dev/null` if compile_prog "$curl_cflags" "$curl_libs" ; then curl=yes - libs_tools="$curl_libs $libs_tools" - libs_softmmu="$curl_libs $libs_softmmu" else if test "$curl" = "yes" ; then - feature_not_found "curl" + feature_not_found "curl" "Install libcurl devel" fi curl=no fi @@ -2223,7 +2602,7 @@ EOF libs_softmmu="$bluez_libs $libs_softmmu" else if test "$bluez" = "yes" ; then - feature_not_found "bluez" + feature_not_found "bluez" "Install bluez-libs/libbluetooth devel" fi bluez="no" fi @@ -2238,14 +2617,36 @@ if test "$mingw32" = yes; then else glib_req_ver=2.12 fi -if $pkg_config --atleast-version=$glib_req_ver gthread-2.0 > /dev/null 2>&1 -then - glib_cflags=`$pkg_config --cflags gthread-2.0 2>/dev/null` - glib_libs=`$pkg_config --libs gthread-2.0 2>/dev/null` - LIBS="$glib_libs $LIBS" - libs_qga="$glib_libs $libs_qga" -else - error_exit "glib-$glib_req_ver required to compile QEMU" +glib_modules=gthread-2.0 +if test "$modules" = yes; then + glib_modules="$glib_modules gmodule-2.0" +fi + +for i in $glib_modules; do + if $pkg_config --atleast-version=$glib_req_ver $i; then + glib_cflags=`$pkg_config --cflags $i` + glib_libs=`$pkg_config --libs $i` + CFLAGS="$glib_cflags $CFLAGS" + LIBS="$glib_libs $LIBS" + libs_qga="$glib_libs $libs_qga" + else + error_exit "glib-$glib_req_ver $i is required to compile QEMU" + fi +done + +########################################## +# SHA command probe for modules +if test "$modules" = yes; then + shacmd_probe="sha1sum sha1 shasum" + for c in $shacmd_probe; do + if which $c >/dev/null 2>&1; then + shacmd="$c" + break + fi + done + if test "$shacmd" = ""; then + error_exit "one of the checksum commands is required to enable modules: $shacmd_probe" + fi fi ########################################## @@ -2270,8 +2671,8 @@ if test "$pixman" = "none"; then pixman_cflags= pixman_libs= elif test "$pixman" = "system"; then - pixman_cflags=`$pkg_config --cflags pixman-1 2>/dev/null` - pixman_libs=`$pkg_config --libs pixman-1 2>/dev/null` + pixman_cflags=`$pkg_config --cflags pixman-1` + pixman_libs=`$pkg_config --libs pixman-1` else if test ! -d ${source_path}/pixman/pixman; then error_exit "pixman not present. Your options:" \ @@ -2341,6 +2742,24 @@ if test "$mingw32" != yes -a "$pthread" = no; then "Make sure to have the pthread libs and headers installed." fi +# check for pthread_setname_np +pthread_setname_np=no +cat > $TMPC << EOF +#include + +static void *f(void *p) { return NULL; } +int main(void) +{ + pthread_t thread; + pthread_create(&thread, 0, f, 0); + pthread_setname_np(thread, "QEMU"); + return 0; +} +EOF +if compile_prog "" "$pthread_lib" ; then + pthread_setname_np=yes +fi + ########################################## # rbd probe if test "$rbd" != "no" ; then @@ -2356,11 +2775,9 @@ EOF rbd_libs="-lrbd -lrados" if compile_prog "" "$rbd_libs" ; then rbd=yes - libs_tools="$rbd_libs $libs_tools" - libs_softmmu="$rbd_libs $libs_softmmu" else if test "$rbd" = "yes" ; then - feature_not_found "rados block device" + feature_not_found "rados block device" "Install librbd/ceph devel" fi rbd=no fi @@ -2370,14 +2787,10 @@ fi # libssh2 probe min_libssh2_version=1.2.8 if test "$libssh2" != "no" ; then - if $pkg_config --atleast-version=$min_libssh2_version libssh2 >/dev/null 2>&1 - then + if $pkg_config --atleast-version=$min_libssh2_version libssh2; then libssh2_cflags=`$pkg_config libssh2 --cflags` libssh2_libs=`$pkg_config libssh2 --libs` libssh2=yes - libs_tools="$libssh2_libs $libs_tools" - libs_softmmu="$libssh2_libs $libs_softmmu" - QEMU_CFLAGS="$QEMU_CFLAGS $libssh2_cflags" else if test "$libssh2" = "yes" ; then error_exit "libssh2 >= $min_libssh2_version required for --enable-libssh2" @@ -2423,11 +2836,9 @@ int main(void) { io_setup(0, NULL); io_set_eventfd(NULL, 0); eventfd(0, 0); retu EOF if compile_prog "" "-laio" ; then linux_aio=yes - libs_softmmu="$libs_softmmu -laio" - libs_tools="$libs_tools -laio" else if test "$linux_aio" = "yes" ; then - feature_not_found "linux AIO" + feature_not_found "linux AIO" "Install libaio devel" fi linux_aio=no fi @@ -2475,7 +2886,7 @@ EOF libattr=yes else if test "$attr" = "yes" ; then - feature_not_found "ATTR" + feature_not_found "ATTR" "Install libc6 or libattr devel" fi attr=no fi @@ -2514,7 +2925,7 @@ fi fdt_required=no for target in $target_list; do case $target in - arm*-softmmu|ppc*-softmmu|microblaze*-softmmu) + aarch64*-softmmu|arm*-softmmu|ppc*-softmmu|microblaze*-softmmu) fdt_required=yes ;; esac @@ -2544,7 +2955,7 @@ EOF fdt=yes dtc_internal="yes" mkdir -p dtc - if [ "$source_path" != `pwd` ] ; then + if [ "$pwd_is_source_path" != "y" ] ; then symlink "$source_path/dtc/Makefile" "dtc/Makefile" symlink "$source_path/dtc/scripts" "dtc/scripts" fi @@ -2552,8 +2963,8 @@ EOF fdt_libs="-L\$(BUILD_DIR)/dtc/libfdt $fdt_libs" elif test "$fdt" = "yes" ; then # have neither and want - prompt for system/submodule install - error_exit "DTC not present. Your options:" \ - " (1) Preferred: Install the DTC devel package" \ + error_exit "DTC (libfdt) not present. Your options:" \ + " (1) Preferred: Install the DTC (libfdt) devel package" \ " (2) Fetch the DTC submodule, using:" \ " git submodule update --init dtc" else @@ -2579,7 +2990,7 @@ EOF glx=yes else if test "$glx" = "yes" ; then - feature_not_found "glx" + feature_not_found "glx" "Install GL devel (e.g. MESA)" fi glx_libs= glx=no @@ -2589,19 +3000,19 @@ fi ########################################## # glusterfs probe if test "$glusterfs" != "no" ; then - if $pkg_config --atleast-version=3 glusterfs-api >/dev/null 2>&1; then + if $pkg_config --atleast-version=3 glusterfs-api; then glusterfs="yes" - glusterfs_cflags=`$pkg_config --cflags glusterfs-api 2>/dev/null` - glusterfs_libs=`$pkg_config --libs glusterfs-api 2>/dev/null` - CFLAGS="$CFLAGS $glusterfs_cflags" - libs_tools="$glusterfs_libs $libs_tools" - libs_softmmu="$glusterfs_libs $libs_softmmu" - if $pkg_config --atleast-version=5 glusterfs-api >/dev/null 2>&1; then + glusterfs_cflags=`$pkg_config --cflags glusterfs-api` + glusterfs_libs=`$pkg_config --libs glusterfs-api` + if $pkg_config --atleast-version=5 glusterfs-api; then glusterfs_discard="yes" fi + if $pkg_config --atleast-version=6 glusterfs-api; then + glusterfs_zerofill="yes" + fi else if test "$glusterfs" = "yes" ; then - feature_not_found "GlusterFS backend support" + feature_not_found "GlusterFS backend support" "Install glusterfs-api devel" fi glusterfs="no" fi @@ -2712,6 +3123,26 @@ if compile_prog "" "" ; then splice=yes fi +########################################## +# libnuma probe + +if test "$numa" != "no" ; then + cat > $TMPC << EOF +#include +int main(void) { return numa_available(); } +EOF + + if compile_prog "" "-lnuma" ; then + numa=yes + libs_softmmu="-lnuma $libs_softmmu" + else + if test "$numa" = "yes" ; then + feature_not_found "numa" "install numactl devel" + fi + numa=no + fi +fi + ########################################## # signalfd probe signalfd="no" @@ -2818,6 +3249,37 @@ if compile_prog "" "" ; then dup3=yes fi +# check for ppoll support +ppoll=no +cat > $TMPC << EOF +#include + +int main(void) +{ + struct pollfd pfd = { .fd = 0, .events = 0, .revents = 0 }; + ppoll(&pfd, 1, 0, 0); + return 0; +} +EOF +if compile_prog "" "" ; then + ppoll=yes +fi + +# check for prctl(PR_SET_TIMERSLACK , ... ) support +prctl_pr_set_timerslack=no +cat > $TMPC << EOF +#include + +int main(void) +{ + prctl(PR_SET_TIMERSLACK, 1, 0, 0, 0); + return 0; +} +EOF +if compile_prog "" "" ; then + prctl_pr_set_timerslack=yes +fi + # check for epoll support epoll=no cat > $TMPC << EOF @@ -2890,7 +3352,7 @@ if test "$docs" != "no" ; then docs=yes else if test "$docs" = "yes" ; then - feature_not_found "docs" + feature_not_found "docs" "Install texinfo and Perl/perl-podlators" fi docs=no fi @@ -2920,31 +3382,44 @@ fi ########################################## # Do we have libiscsi -# We check for iscsi_unmap_sync() to make sure we have a -# recent enough version of libiscsi. +# We check for iscsi_write16_sync() to make sure we have a +# at least version 1.4.0 of libiscsi. if test "$libiscsi" != "no" ; then cat > $TMPC << EOF #include #include -int main(void) { iscsi_unmap_sync(NULL,0,0,0,NULL,0); return 0; } +int main(void) { iscsi_write16_sync(NULL,0,0,NULL,0,0,0,0,0,0,0); return 0; } EOF - if $pkg_config --atleast-version=1.7.0 libiscsi --modversion >/dev/null 2>&1; then + if $pkg_config --atleast-version=1.7.0 libiscsi; then libiscsi="yes" - libiscsi_cflags=$($pkg_config --cflags libiscsi 2>/dev/null) - libiscsi_libs=$($pkg_config --libs libiscsi 2>/dev/null) - CFLAGS="$CFLAGS $libiscsi_cflags" - LIBS="$LIBS $libiscsi_libs" + libiscsi_cflags=$($pkg_config --cflags libiscsi) + libiscsi_libs=$($pkg_config --libs libiscsi) elif compile_prog "" "-liscsi" ; then libiscsi="yes" - LIBS="$LIBS -liscsi" + libiscsi_libs="-liscsi" else if test "$libiscsi" = "yes" ; then - feature_not_found "libiscsi" + feature_not_found "libiscsi" "Install libiscsi devel" fi libiscsi="no" fi fi +# We also need to know the API version because there was an +# API change from 1.4.0 to 1.5.0. +if test "$libiscsi" = "yes"; then + cat >$TMPC < +int main(void) +{ + iscsi_read10_task(0, 0, 0, 0, 0, 0, 0); + return 0; +} +EOF + if compile_prog "" "-liscsi"; then + libiscsi_version="1.4.0" + fi +fi ########################################## # Do we need libm @@ -2998,8 +3473,8 @@ int main(void) { spice_server_new(); return 0; } EOF spice_cflags=$($pkg_config --cflags spice-protocol spice-server 2>/dev/null) spice_libs=$($pkg_config --libs spice-protocol spice-server 2>/dev/null) - if $pkg_config --atleast-version=0.12.0 spice-server >/dev/null 2>&1 && \ - $pkg_config --atleast-version=0.12.3 spice-protocol > /dev/null 2>&1 && \ + if $pkg_config --atleast-version=0.12.0 spice-server && \ + $pkg_config --atleast-version=0.12.3 spice-protocol && \ compile_prog "$spice_cflags" "$spice_libs" ; then spice="yes" libs_softmmu="$libs_softmmu $spice_libs" @@ -3008,7 +3483,7 @@ EOF spice_server_version=$($pkg_config --modversion spice-server) else if test "$spice" = "yes" ; then - feature_not_found "spice" + feature_not_found "spice" "Install spice-server and spice-protocol devel" fi spice="no" fi @@ -3022,10 +3497,10 @@ if test "$smartcard_nss" != "no"; then #include int main(void) { PK11_FreeSlot(0); return 0; } EOF - smartcard_includes="-I\$(SRC_PATH)/libcacard" - libcacard_libs="$($pkg_config --libs nss 2>/dev/null) $glib_libs" - libcacard_cflags="$($pkg_config --cflags nss 2>/dev/null) $glib_cflags" - test_cflags="$libcacard_cflags" + # FIXME: do not include $glib_* in here + nss_libs="$($pkg_config --libs nss 2>/dev/null) $glib_libs" + nss_cflags="$($pkg_config --cflags nss 2>/dev/null) $glib_cflags" + test_cflags="$nss_cflags" # The header files in nss < 3.13.3 have a bug which causes them to # emit a warning. If we're going to compile QEMU with -Werror, then # test that the headers don't have this bug. Otherwise we would pass @@ -3034,12 +3509,9 @@ EOF test_cflags="-Werror $test_cflags" fi if test -n "$libtool" && - $pkg_config --atleast-version=3.12.8 nss >/dev/null 2>&1 && \ - compile_prog "$test_cflags" "$libcacard_libs"; then + $pkg_config --atleast-version=3.12.8 nss && \ + compile_prog "$test_cflags" "$nss_libs"; then smartcard_nss="yes" - QEMU_CFLAGS="$QEMU_CFLAGS $libcacard_cflags" - QEMU_INCLUDES="$QEMU_INCLUDES $smartcard_includes" - libs_softmmu="$libcacard_libs $libs_softmmu" else if test "$smartcard_nss" = "yes"; then feature_not_found "nss" @@ -3050,16 +3522,15 @@ fi # check for libusb if test "$libusb" != "no" ; then - if $pkg_config --atleast-version=1.0.13 libusb-1.0 >/dev/null 2>&1 ; then + if $pkg_config --atleast-version=1.0.13 libusb-1.0; then libusb="yes" - usb="libusb" - libusb_cflags=$($pkg_config --cflags libusb-1.0 2>/dev/null) - libusb_libs=$($pkg_config --libs libusb-1.0 2>/dev/null) + libusb_cflags=$($pkg_config --cflags libusb-1.0) + libusb_libs=$($pkg_config --libs libusb-1.0) QEMU_CFLAGS="$QEMU_CFLAGS $libusb_cflags" libs_softmmu="$libs_softmmu $libusb_libs" else if test "$libusb" = "yes"; then - feature_not_found "libusb" + feature_not_found "libusb" "Install libusb devel" fi libusb="no" fi @@ -3067,20 +3538,75 @@ fi # check for usbredirparser for usb network redirection support if test "$usb_redir" != "no" ; then - if $pkg_config --atleast-version=0.6 libusbredirparser-0.5 >/dev/null 2>&1 ; then + if $pkg_config --atleast-version=0.6 libusbredirparser-0.5; then usb_redir="yes" - usb_redir_cflags=$($pkg_config --cflags libusbredirparser-0.5 2>/dev/null) - usb_redir_libs=$($pkg_config --libs libusbredirparser-0.5 2>/dev/null) + usb_redir_cflags=$($pkg_config --cflags libusbredirparser-0.5) + usb_redir_libs=$($pkg_config --libs libusbredirparser-0.5) QEMU_CFLAGS="$QEMU_CFLAGS $usb_redir_cflags" libs_softmmu="$libs_softmmu $usb_redir_libs" else if test "$usb_redir" = "yes"; then - feature_not_found "usb-redir" + feature_not_found "usb-redir" "Install usbredir devel" fi usb_redir="no" fi fi +########################################## +# check if we have VSS SDK headers for win + +if test "$mingw32" = "yes" -a "$guest_agent" != "no" -a "$vss_win32_sdk" != "no" ; then + case "$vss_win32_sdk" in + "") vss_win32_include="-I$source_path" ;; + *\ *) # The SDK is installed in "Program Files" by default, but we cannot + # handle path with spaces. So we symlink the headers into ".sdk/vss". + vss_win32_include="-I$source_path/.sdk/vss" + symlink "$vss_win32_sdk/inc" "$source_path/.sdk/vss/inc" + ;; + *) vss_win32_include="-I$vss_win32_sdk" + esac + cat > $TMPC << EOF +#define __MIDL_user_allocate_free_DEFINED__ +#include +int main(void) { return VSS_CTX_BACKUP; } +EOF + if compile_prog "$vss_win32_include" "" ; then + guest_agent_with_vss="yes" + QEMU_CFLAGS="$QEMU_CFLAGS $vss_win32_include" + libs_qga="-lole32 -loleaut32 -lshlwapi -luuid -lstdc++ -Wl,--enable-stdcall-fixup $libs_qga" + else + if test "$vss_win32_sdk" != "" ; then + echo "ERROR: Please download and install Microsoft VSS SDK:" + echo "ERROR: http://www.microsoft.com/en-us/download/details.aspx?id=23490" + echo "ERROR: On POSIX-systems, you can extract the SDK headers by:" + echo "ERROR: scripts/extract-vsssdk-headers setup.exe" + echo "ERROR: The headers are extracted in the directory \`inc'." + feature_not_found "VSS support" + fi + guest_agent_with_vss="no" + fi +fi + +########################################## +# lookup Windows platform SDK (if not specified) +# The SDK is needed only to build .tlb (type library) file of guest agent +# VSS provider from the source. It is usually unnecessary because the +# pre-compiled .tlb file is included. + +if test "$mingw32" = "yes" -a "$guest_agent" != "no" -a "$guest_agent_with_vss" = "yes" ; then + if test -z "$win_sdk"; then + programfiles="$PROGRAMFILES" + test -n "$PROGRAMW6432" && programfiles="$PROGRAMW6432" + if test -n "$programfiles"; then + win_sdk=$(ls -d "$programfiles/Microsoft SDKs/Windows/v"* | tail -1) 2>/dev/null + else + feature_not_found "Windows SDK" + fi + elif test "$win_sdk" = "no"; then + win_sdk="" + fi +fi + ########################################## ########################################## @@ -3159,15 +3685,25 @@ fi # For 'ust' backend, test if ust headers are present if test "$trace_backend" = "ust"; then cat > $TMPC << EOF -#include -#include +#include int main(void) { return 0; } EOF if compile_prog "" "" ; then - LIBS="-lust -lurcu-bp $LIBS" - libs_qga="-lust -lurcu-bp $libs_qga" + if $pkg_config lttng-ust --exists; then + lttng_ust_libs=`$pkg_config --libs lttng-ust` + else + lttng_ust_libs="-llttng-ust" + fi + if $pkg_config liburcu-bp --exists; then + urcu_bp_libs=`$pkg_config --libs liburcu-bp` + else + urcu_bp_libs="-lurcu-bp" + fi + + LIBS="$lttng_ust_libs $urcu_bp_libs $LIBS" + libs_qga="$lttng_ust_libs $urcu_bp_libs $libs_qga" else - error_exit "Trace backend 'ust' missing libust header files" + error_exit "Trace backend 'ust' missing lttng-ust header files" fi fi @@ -3240,6 +3776,17 @@ else esac fi +if test "$coroutine_pool" = ""; then + if test "$coroutine" = "gthread"; then + coroutine_pool=no + else + coroutine_pool=yes + fi +fi +if test "$coroutine" = "gthread" -a "$coroutine_pool" = "yes"; then + error_exit "'gthread' coroutine backend does not support pool (use --disable-coroutine-pool)" +fi + ########################################## # check if we have open_by_handle_at @@ -3333,7 +3880,18 @@ cpuid_h=no cat > $TMPC << EOF #include int main(void) { - return 0; + unsigned a, b, c, d; + int max = __get_cpuid_max(0, 0); + + if (max >= 1) { + __cpuid(1, a, b, c, d); + } + + if (max >= 7) { + __cpuid_count(7, 0, a, b, c, d); + } + + return 0; } EOF if compile_prog "" "" ; then @@ -3345,6 +3903,11 @@ fi int128=no cat > $TMPC << EOF +#if defined(__clang_major__) && defined(__clang_minor__) +# if ((__clang_major__ < 3) || (__clang_major__ == 3) && (__clang_minor__ < 2)) +# error __int128_t does not work in CLANG before 3.2 +# endif +#endif __int128_t a; __uint128_t b; int main (void) { @@ -3380,9 +3943,23 @@ if test "$gcov" = "yes" ; then CFLAGS="-fprofile-arcs -ftest-coverage -g $CFLAGS" LDFLAGS="-fprofile-arcs -ftest-coverage $LDFLAGS" elif test "$debug" = "no" ; then - CFLAGS="-O2 -D_FORTIFY_SOURCE=2 $CFLAGS" + CFLAGS="-O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 $CFLAGS" fi +########################################## +# Do we have libnfs +if test "$libnfs" != "no" ; then + if $pkg_config --atleast-version=1.9.3 libnfs; then + libnfs="yes" + libnfs_libs=$($pkg_config --libs libnfs) + LIBS="$LIBS $libnfs_libs" + else + if test "$libnfs" = "yes" ; then + feature_not_found "libnfs" + fi + libnfs="no" + fi +fi # Disable zero malloc errors for official releases unless explicitly told to # enable/disable @@ -3422,6 +3999,7 @@ if test "$mingw32" = "yes" ; then fi qemu_confdir=$sysconfdir$confsuffix +qemu_moddir=$libdir$confsuffix qemu_datadir=$datadir$confsuffix qemu_localedir="$datadir/locale" @@ -3446,8 +4024,11 @@ if test "$softmmu" = yes ; then fi fi if [ "$guest_agent" != "no" ]; then - if [ "$linux" = "yes" -o "$bsd" = "yes" -o "$solaris" = "yes" ] ; then + if [ "$linux" = "yes" -o "$bsd" = "yes" -o "$solaris" = "yes" -o "$mingw32" = "yes" ] ; then tools="qemu-ga\$(EXESUF) $tools" + if [ "$mingw32" = "yes" -a "$guest_agent_with_vss" = "yes" ]; then + tools="qga/vss-win32/qga-vss.dll qga/vss-win32/qga-vss.tlb $tools" + fi guest_agent=yes elif [ "$guest_agent" != yes ]; then guest_agent=no @@ -3475,7 +4056,7 @@ fi if test "$pie" = "no" ; then textseg_addr= case "$cpu" in - arm | hppa | i386 | m68k | ppc | ppc64 | s390* | sparc | sparc64 | x86_64) + arm | hppa | i386 | m68k | ppc | ppc64 | s390* | sparc | sparc64 | x86_64 | x32) textseg_addr=0x60000000 ;; mips) @@ -3509,6 +4090,7 @@ echo "Install prefix $prefix" echo "BIOS directory `eval echo $qemu_datadir`" echo "binary directory `eval echo $bindir`" echo "library directory `eval echo $libdir`" +echo "module directory `eval echo $qemu_moddir`" echo "libexec directory `eval echo $libexecdir`" echo "include directory `eval echo $includedir`" echo "config directory `eval echo $sysconfdir`" @@ -3518,11 +4100,14 @@ echo "Manual directory `eval echo $mandir`" echo "ELF interp prefix $interp_prefix" else echo "local state directory queried at runtime" +echo "Windows SDK $win_sdk" fi echo "Source path $source_path" echo "C compiler $cc" echo "Host C compiler $host_cc" +echo "C++ compiler $cxx" echo "Objective-C compiler $objcc" +echo "ARFLAGS $ARFLAGS" echo "CFLAGS $CFLAGS" echo "QEMU_CFLAGS $QEMU_CFLAGS" echo "LDFLAGS $LDFLAGS" @@ -3532,6 +4117,7 @@ echo "python $python" if test "$slirp" = "yes" ; then echo "smbd $smbd" fi +echo "module support $modules" echo "host CPU $cpu" echo "host big endian $bigendian" echo "target list $target_list" @@ -3541,20 +4127,19 @@ echo "sparse enabled $sparse" echo "strip binaries $strip_opt" echo "profiler $profiler" echo "static build $static" -echo "-Werror enabled $werror" if test "$darwin" = "yes" ; then echo "Cocoa support $cocoa" fi echo "pixman $pixman" echo "SDL support $sdl" echo "GTK support $gtk" +echo "VTE support $vte" echo "curses support $curses" echo "curl support $curl" echo "mingw32 support $mingw32" echo "Audio drivers $audio_drv_list" echo "Block whitelist (rw) $block_drv_rw_whitelist" echo "Block whitelist (ro) $block_drv_ro_whitelist" -echo "Mixer emulation $mixemu" echo "VirtFS support $virtfs" echo "VNC support $vnc" if test "$vnc" = "yes" ; then @@ -3571,11 +4156,10 @@ echo "xen support $xen" echo "brlapi support $brlapi" echo "bluez support $bluez" echo "Documentation $docs" -[ ! -z "$uname_release" ] && \ -echo "uname -r $uname_release" echo "GUEST_BASE $guest_base" echo "PIE $pie" echo "vde support $vde" +echo "netmap support $netmap" echo "Linux AIO support $linux_aio" echo "ATTR/XATTR support $attr" echo "Install blobs $blobs" @@ -3593,18 +4177,31 @@ echo "libcap-ng support $cap_ng" echo "vhost-net support $vhost_net" echo "vhost-scsi support $vhost_scsi" echo "Trace backend $trace_backend" +if test "$trace_backend" = "simple"; then echo "Trace output file $trace_file-" +fi +if test "$spice" = "yes"; then echo "spice support $spice ($spice_protocol_version/$spice_server_version)" +else +echo "spice support $spice" +fi echo "rbd support $rbd" echo "xfsctl support $xfs" echo "nss used $smartcard_nss" echo "libusb $libusb" echo "usb net redir $usb_redir" echo "GLX support $glx" +if test "$libiscsi_version" = "1.4.0"; then +echo "libiscsi support $libiscsi (1.4.0)" +else echo "libiscsi support $libiscsi" +fi +echo "libnfs support $libnfs" echo "build guest agent $guest_agent" +echo "QGA VSS support $guest_agent_with_vss" echo "seccomp support $seccomp" echo "coroutine backend $coroutine" +echo "coroutine pool $coroutine_pool" echo "GlusterFS support $glusterfs" echo "virtio-blk-data-plane $virtio_blk_data_plane" echo "gcov $gcov_tool" @@ -3613,6 +4210,11 @@ echo "TPM support $tpm" echo "libssh2 support $libssh2" echo "TPM passthrough $tpm_passthrough" echo "QOM debugging $qom_cast_debug" +echo "vhdx $vhdx" +echo "Quorum $quorum" +echo "lzo support $lzo" +echo "snappy support $snappy" +echo "NUMA host support $numa" if test "$sdl_too_old" = "yes"; then echo "-> Your SDL version is too old - please upgrade to have SDL support" @@ -3623,8 +4225,6 @@ config_host_mak="config-host.mak" echo "# Automatically generated by configure - do not modify" >config-all-disas.mak echo "# Automatically generated by configure - do not modify" > $config_host_mak -printf "# Configured with:" >> $config_host_mak -printf " '%s'" "$0" "$@" >> $config_host_mak echo >> $config_host_mak echo all: >> $config_host_mak @@ -3638,6 +4238,7 @@ echo "sysconfdir=$sysconfdir" >> $config_host_mak echo "qemu_confdir=$qemu_confdir" >> $config_host_mak echo "qemu_datadir=$qemu_datadir" >> $config_host_mak echo "qemu_docdir=$qemu_docdir" >> $config_host_mak +echo "qemu_moddir=$qemu_moddir" >> $config_host_mak if test "$mingw32" = "no" ; then echo "qemu_localstatedir=$local_statedir" >> $config_host_mak fi @@ -3649,14 +4250,6 @@ echo "libs_softmmu=$libs_softmmu" >> $config_host_mak echo "ARCH=$ARCH" >> $config_host_mak -case "$cpu" in - arm|i386|x86_64|ppc|aarch64) - # The TCG interpreter currently does not support ld/st optimization. - if test "$tcg_interpreter" = "no" ; then - echo "CONFIG_QEMU_LDST_OPTIMIZATION=y" >> $config_host_mak - fi - ;; -esac if test "$debug_tcg" = "yes" ; then echo "CONFIG_DEBUG_TCG=y" >> $config_host_mak fi @@ -3677,6 +4270,10 @@ if test "$mingw32" = "yes" ; then version_micro=0 echo "CONFIG_FILEVERSION=$version_major,$version_minor,$version_subminor,$version_micro" >> $config_host_mak echo "CONFIG_PRODUCTVERSION=$version_major,$version_minor,$version_subminor,$version_micro" >> $config_host_mak + if test "$guest_agent_with_vss" = "yes" ; then + echo "CONFIG_QGA_VSS=y" >> $config_host_mak + echo "WIN_SDK=\"$win_sdk\"" >> $config_host_mak + fi else echo "CONFIG_POSIX=y" >> $config_host_mak fi @@ -3716,6 +4313,9 @@ fi if test "$vde" = "yes" ; then echo "CONFIG_VDE=y" >> $config_host_mak fi +if test "$netmap" = "yes" ; then + echo "CONFIG_NETMAP=y" >> $config_host_mak +fi if test "$cap_ng" = "yes" ; then echo "CONFIG_LIBCAP=y" >> $config_host_mak fi @@ -3735,9 +4335,6 @@ if test "$audio_win_int" = "yes" ; then fi echo "CONFIG_BDRV_RW_WHITELIST=$block_drv_rw_whitelist" >> $config_host_mak echo "CONFIG_BDRV_RO_WHITELIST=$block_drv_ro_whitelist" >> $config_host_mak -if test "$mixemu" = "yes" ; then - echo "CONFIG_MIXEMU=y" >> $config_host_mak -fi if test "$vnc" = "yes" ; then echo "CONFIG_VNC=y" >> $config_host_mak fi @@ -3774,8 +4371,15 @@ echo "TARGET_DIRS=$target_list" >> $config_host_mak if [ "$docs" = "yes" ] ; then echo "BUILD_DOCS=yes" >> $config_host_mak fi +if test "$modules" = "yes"; then + # $shacmd can generate a hash started with digit, which the compiler doesn't + # like as an symbol. So prefix it with an underscore + echo "CONFIG_STAMP=_`(echo $qemu_version; echo $pkgversion; cat $0) | $shacmd - | cut -f1 -d\ `" >> $config_host_mak + echo "CONFIG_MODULES=y" >> $config_host_mak +fi if test "$sdl" = "yes" ; then echo "CONFIG_SDL=y" >> $config_host_mak + echo "CONFIG_SDLABI=$sdlabi" >> $config_host_mak echo "SDL_CFLAGS=$sdl_cflags" >> $config_host_mak fi if test "$cocoa" = "yes" ; then @@ -3814,6 +4418,12 @@ fi if test "$dup3" = "yes" ; then echo "CONFIG_DUP3=y" >> $config_host_mak fi +if test "$ppoll" = "yes" ; then + echo "CONFIG_PPOLL=y" >> $config_host_mak +fi +if test "$prctl_pr_set_timerslack" = "yes" ; then + echo "CONFIG_PRCTL_PR_SET_TIMERSLACK=y" >> $config_host_mak +fi if test "$epoll" = "yes" ; then echo "CONFIG_EPOLL=y" >> $config_host_mak fi @@ -3839,8 +4449,9 @@ if test "$bswap_h" = "yes" ; then echo "CONFIG_MACHINE_BSWAP_H=y" >> $config_host_mak fi if test "$curl" = "yes" ; then - echo "CONFIG_CURL=y" >> $config_host_mak + echo "CONFIG_CURL=m" >> $config_host_mak echo "CURL_CFLAGS=$curl_cflags" >> $config_host_mak + echo "CURL_LIBS=$curl_libs" >> $config_host_mak fi if test "$brlapi" = "yes" ; then echo "CONFIG_BRLAPI=y" >> $config_host_mak @@ -3852,7 +4463,11 @@ fi echo "GLIB_CFLAGS=$glib_cflags" >> $config_host_mak if test "$gtk" = "yes" ; then echo "CONFIG_GTK=y" >> $config_host_mak + echo "CONFIG_GTKABI=$gtkabi" >> $config_host_mak echo "GTK_CFLAGS=$gtk_cflags" >> $config_host_mak +fi +if test "$vte" = "yes" ; then + echo "CONFIG_VTE=y" >> $config_host_mak echo "VTE_CFLAGS=$vte_cflags" >> $config_host_mak fi if test "$xen" = "yes" ; then @@ -3911,8 +4526,8 @@ fi if test "$smartcard_nss" = "yes" ; then echo "CONFIG_SMARTCARD_NSS=y" >> $config_host_mak - echo "libcacard_libs=$libcacard_libs" >> $config_host_mak - echo "libcacard_cflags=$libcacard_cflags" >> $config_host_mak + echo "NSS_LIBS=$nss_libs" >> $config_host_mak + echo "NSS_CFLAGS=$nss_cflags" >> $config_host_mak fi if test "$libusb" = "yes" ; then @@ -3928,8 +4543,25 @@ if test "$glx" = "yes" ; then echo "GLX_LIBS=$glx_libs" >> $config_host_mak fi +if test "$lzo" = "yes" ; then + echo "CONFIG_LZO=y" >> $config_host_mak +fi + +if test "$snappy" = "yes" ; then + echo "CONFIG_SNAPPY=y" >> $config_host_mak +fi + if test "$libiscsi" = "yes" ; then - echo "CONFIG_LIBISCSI=y" >> $config_host_mak + echo "CONFIG_LIBISCSI=m" >> $config_host_mak + if test "$libiscsi_version" = "1.4.0"; then + echo "CONFIG_LIBISCSI_1_4=y" >> $config_host_mak + fi + echo "LIBISCSI_CFLAGS=$libiscsi_cflags" >> $config_host_mak + echo "LIBISCSI_LIBS=$libiscsi_libs" >> $config_host_mak +fi + +if test "$libnfs" = "yes" ; then + echo "CONFIG_LIBNFS=y" >> $config_host_mak fi if test "$seccomp" = "yes"; then @@ -3941,8 +4573,6 @@ if [ "$bsd" = "yes" ] ; then echo "CONFIG_BSD=y" >> $config_host_mak fi -echo "CONFIG_UNAME_RELEASE=\"$uname_release\"" >> $config_host_mak - if test "$zero_malloc" = "yes" ; then echo "CONFIG_ZERO_MALLOC=y" >> $config_host_mak fi @@ -3950,10 +4580,17 @@ if test "$qom_cast_debug" = "yes" ; then echo "CONFIG_QOM_CAST_DEBUG=y" >> $config_host_mak fi if test "$rbd" = "yes" ; then - echo "CONFIG_RBD=y" >> $config_host_mak + echo "CONFIG_RBD=m" >> $config_host_mak + echo "RBD_CFLAGS=$rbd_cflags" >> $config_host_mak + echo "RBD_LIBS=$rbd_libs" >> $config_host_mak fi echo "CONFIG_COROUTINE_BACKEND=$coroutine" >> $config_host_mak +if test "$coroutine_pool" = "yes" ; then + echo "CONFIG_COROUTINE_POOL=1" >> $config_host_mak +else + echo "CONFIG_COROUTINE_POOL=0" >> $config_host_mak +fi if test "$open_by_handle_at" = "yes" ; then echo "CONFIG_OPEN_BY_HANDLE=y" >> $config_host_mak @@ -3988,40 +4625,43 @@ if test "$getauxval" = "yes" ; then fi if test "$glusterfs" = "yes" ; then - echo "CONFIG_GLUSTERFS=y" >> $config_host_mak + echo "CONFIG_GLUSTERFS=m" >> $config_host_mak + echo "GLUSTERFS_CFLAGS=$glusterfs_cflags" >> $config_host_mak + echo "GLUSTERFS_LIBS=$glusterfs_libs" >> $config_host_mak fi if test "$glusterfs_discard" = "yes" ; then echo "CONFIG_GLUSTERFS_DISCARD=y" >> $config_host_mak fi +if test "$glusterfs_zerofill" = "yes" ; then + echo "CONFIG_GLUSTERFS_ZEROFILL=y" >> $config_host_mak +fi + if test "$libssh2" = "yes" ; then - echo "CONFIG_LIBSSH2=y" >> $config_host_mak + echo "CONFIG_LIBSSH2=m" >> $config_host_mak + echo "LIBSSH2_CFLAGS=$libssh2_cflags" >> $config_host_mak + echo "LIBSSH2_LIBS=$libssh2_libs" >> $config_host_mak +fi + +if test "$quorum" = "yes" ; then + echo "CONFIG_QUORUM=y" >> $config_host_mak fi if test "$virtio_blk_data_plane" = "yes" ; then echo 'CONFIG_VIRTIO_BLK_DATA_PLANE=$(CONFIG_VIRTIO)' >> $config_host_mak fi +if test "$vhdx" = "yes" ; then + echo "CONFIG_VHDX=y" >> $config_host_mak +fi + # USB host support -case "$usb" in -linux) - echo "HOST_USB=linux legacy" >> $config_host_mak -;; -bsd) - echo "HOST_USB=bsd" >> $config_host_mak -;; -libusb) - if test "$linux" = "yes"; then - echo "HOST_USB=libusb linux legacy" >> $config_host_mak - else - echo "HOST_USB=libusb legacy" >> $config_host_mak - fi -;; -*) +if test "$libusb" = "yes"; then + echo "HOST_USB=libusb legacy" >> $config_host_mak +else echo "HOST_USB=stub" >> $config_host_mak -;; -esac +fi # TPM passthrough support? if test "$tpm" = "yes"; then @@ -4061,7 +4701,7 @@ if test "$trace_backend" = "ftrace"; then echo "CONFIG_TRACE_FTRACE=y" >> $config_host_mak trace_default=no else - feature_not_found "ftrace(trace backend)" + feature_not_found "ftrace(trace backend)" "ftrace requires Linux" fi fi echo "CONFIG_TRACE_FILE=$trace_file" >> $config_host_mak @@ -4073,13 +4713,23 @@ if test "$rdma" = "yes" ; then echo "CONFIG_RDMA=y" >> $config_host_mak fi +# Hold two types of flag: +# CONFIG_THREAD_SETNAME_BYTHREAD - we've got a way of setting the name on +# a thread we have a handle to +# CONFIG_PTHREAD_SETNAME_NP - A way of doing it on a particular +# platform +if test "$pthread_setname_np" = "yes" ; then + echo "CONFIG_THREAD_SETNAME_BYTHREAD=y" >> $config_host_mak + echo "CONFIG_PTHREAD_SETNAME_NP=y" >> $config_host_mak +fi + if test "$tcg_interpreter" = "yes"; then QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES" elif test "$ARCH" = "sparc64" ; then QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/sparc $QEMU_INCLUDES" elif test "$ARCH" = "s390x" ; then QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/s390 $QEMU_INCLUDES" -elif test "$ARCH" = "x86_64" ; then +elif test "$ARCH" = "x86_64" -o "$ARCH" = "x32" ; then QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/i386 $QEMU_INCLUDES" else QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/\$(ARCH) $QEMU_INCLUDES" @@ -4101,10 +4751,15 @@ else fi echo "PYTHON=$python" >> $config_host_mak echo "CC=$cc" >> $config_host_mak +if $iasl -h > /dev/null 2>&1; then + echo "IASL=$iasl" >> $config_host_mak +fi echo "CC_I386=$cc_i386" >> $config_host_mak echo "HOST_CC=$host_cc" >> $config_host_mak +echo "CXX=$cxx" >> $config_host_mak echo "OBJCC=$objcc" >> $config_host_mak echo "AR=$ar" >> $config_host_mak +echo "ARFLAGS=$ARFLAGS" >> $config_host_mak echo "AS=$as" >> $config_host_mak echo "CPP=$cpp" >> $config_host_mak echo "OBJCOPY=$objcopy" >> $config_host_mak @@ -4112,6 +4767,7 @@ echo "LD=$ld" >> $config_host_mak echo "WINDRES=$windres" >> $config_host_mak echo "LIBTOOL=$libtool" >> $config_host_mak echo "CFLAGS=$CFLAGS" >> $config_host_mak +echo "CFLAGS_NOPIE=$CFLAGS_NOPIE" >> $config_host_mak echo "QEMU_CFLAGS=$QEMU_CFLAGS" >> $config_host_mak echo "QEMU_INCLUDES=$QEMU_INCLUDES" >> $config_host_mak if test "$sparse" = "yes" ; then @@ -4125,10 +4781,13 @@ else echo "AUTOCONF_HOST := " >> $config_host_mak fi echo "LDFLAGS=$LDFLAGS" >> $config_host_mak +echo "LDFLAGS_NOPIE=$LDFLAGS_NOPIE" >> $config_host_mak echo "LIBTOOLFLAGS=$LIBTOOLFLAGS" >> $config_host_mak echo "LIBS+=$LIBS" >> $config_host_mak echo "LIBS_TOOLS+=$libs_tools" >> $config_host_mak echo "EXESUF=$EXESUF" >> $config_host_mak +echo "DSOSUF=$DSOSUF" >> $config_host_mak +echo "LDFLAGS_SHARED=$LDFLAGS_SHARED" >> $config_host_mak echo "LIBS_QGA+=$libs_qga" >> $config_host_mak echo "POD2MAN=$POD2MAN" >> $config_host_mak echo "TRANSLATE_OPT_CFLAGS=$TRANSLATE_OPT_CFLAGS" >> $config_host_mak @@ -4141,7 +4800,7 @@ fi if test "$linux" = "yes" ; then mkdir -p linux-headers case "$cpu" in - i386|x86_64) + i386|x86_64|x32) linux_arch=x86 ;; ppcemb|ppc|ppc64) @@ -4227,6 +4886,11 @@ case "$target_name" in bflt="yes" gdb_xml_files="arm-core.xml arm-vfp.xml arm-vfp3.xml arm-neon.xml" ;; + aarch64) + TARGET_BASE_ARCH=arm + bflt="yes" + gdb_xml_files="aarch64-core.xml aarch64-fpu.xml" + ;; cris) ;; lm32) @@ -4337,7 +5001,7 @@ case "$target_name" in *) esac case "$target_name" in - arm|i386|x86_64|ppcemb|ppc|ppc64|s390x) + aarch64|arm|i386|x86_64|ppcemb|ppc|ppc64|s390x) # Make sure the target and host cpus are compatible if test "$kvm" = "yes" -a "$target_softmmu" = "yes" -a \ \( "$target_name" = "$cpu" -o \ @@ -4395,9 +5059,19 @@ for i in $ARCH $TARGET_BASE_ARCH ; do echo "CONFIG_ALPHA_DIS=y" >> $config_target_mak echo "CONFIG_ALPHA_DIS=y" >> config-all-disas.mak ;; + aarch64) + if test -n "${cxx}"; then + echo "CONFIG_ARM_A64_DIS=y" >> $config_target_mak + echo "CONFIG_ARM_A64_DIS=y" >> config-all-disas.mak + fi + ;; arm) echo "CONFIG_ARM_DIS=y" >> $config_target_mak echo "CONFIG_ARM_DIS=y" >> config-all-disas.mak + if test -n "${cxx}"; then + echo "CONFIG_ARM_A64_DIS=y" >> $config_target_mak + echo "CONFIG_ARM_A64_DIS=y" >> config-all-disas.mak + fi ;; cris) echo "CONFIG_CRIS_DIS=y" >> $config_target_mak @@ -4407,7 +5081,7 @@ for i in $ARCH $TARGET_BASE_ARCH ; do echo "CONFIG_HPPA_DIS=y" >> $config_target_mak echo "CONFIG_HPPA_DIS=y" >> config-all-disas.mak ;; - i386|x86_64) + i386|x86_64|x32) echo "CONFIG_I386_DIS=y" >> $config_target_mak echo "CONFIG_I386_DIS=y" >> config-all-disas.mak ;; @@ -4506,8 +5180,13 @@ if [ "$dtc_internal" = "yes" ]; then echo "config-host.h: subdir-dtc" >> $config_host_mak fi +if test "$numa" = "yes"; then + echo "CONFIG_NUMA=y" >> $config_host_mak +fi + # build tree in object directory in case the source is not in the current directory -DIRS="tests tests/tcg tests/tcg/cris tests/tcg/lm32 tests/libqos tests/qapi-schema tests/tcg/xtensa" +DIRS="tests tests/tcg tests/tcg/cris tests/tcg/lm32 tests/libqos tests/qapi-schema tests/tcg/xtensa tests/qemu-iotests" +DIRS="$DIRS fsdev" DIRS="$DIRS pc-bios/optionrom pc-bios/spapr-rtas pc-bios/s390-ccw" DIRS="$DIRS roms/seabios roms/vgabios" DIRS="$DIRS qapi-generated" @@ -4530,9 +5209,13 @@ for bios_file in \ do FILES="$FILES pc-bios/`basename $bios_file`" done +for test_file in `find $source_path/tests/acpi-test-data -type f` +do + FILES="$FILES tests/acpi-test-data`echo $test_file | sed -e 's/.*acpi-test-data//'`" +done mkdir -p $DIRS for f in $FILES ; do - if [ -e "$source_path/$f" ] && [ "$source_path" != `pwd` ]; then + if [ -e "$source_path/$f" ] && [ "$pwd_is_source_path" != "y" ]; then symlink "$source_path/$f" "$f" fi done @@ -4547,10 +5230,24 @@ for rom in seabios vgabios ; do echo "BCC=bcc" >> $config_mak echo "CPP=$cpp" >> $config_mak echo "OBJCOPY=objcopy" >> $config_mak - echo "IASL=iasl" >> $config_mak + echo "IASL=$iasl" >> $config_mak echo "LD=$ld" >> $config_mak done if test "$docs" = "yes" ; then mkdir -p QMP fi + +# Save the configure command line for later reuse. +cat <config.status +#!/bin/sh +# Generated by configure. +# Run this file to recreate the current configuration. +# Compiler output produced by configure, useful for debugging +# configure, is in config.log if it exists. +EOD +printf "exec" >>config.status +printf " '%s'" "$0" "$@" >>config.status +echo >>config.status +chmod +x config.status + diff --git a/coroutine-gthread.c b/coroutine-gthread.c index d3e5b991f7..a61efe01dc 100644 --- a/coroutine-gthread.c +++ b/coroutine-gthread.c @@ -115,14 +115,11 @@ static inline GThread *create_thread(GThreadFunc func, gpointer data) static void __attribute__((constructor)) coroutine_init(void) { - if (!g_thread_supported()) { #if !GLIB_CHECK_VERSION(2, 31, 0) + if (!g_thread_supported()) { g_thread_init(NULL); -#else - fprintf(stderr, "glib threading failed to initialize.\n"); - exit(1); -#endif } +#endif init_coroutine_cond(); } diff --git a/cpu-exec.c b/cpu-exec.c index 301be28bf7..38e5f02a30 100644 --- a/cpu-exec.c +++ b/cpu-exec.c @@ -23,29 +23,22 @@ #include "qemu/atomic.h" #include "sysemu/qtest.h" -bool qemu_cpu_has_work(CPUState *cpu) +void cpu_loop_exit(CPUState *cpu) { - return cpu_has_work(cpu); -} - -void cpu_loop_exit(CPUArchState *env) -{ - CPUState *cpu = ENV_GET_CPU(env); - cpu->current_tb = NULL; - siglongjmp(env->jmp_env, 1); + siglongjmp(cpu->jmp_env, 1); } /* exit the current TB from a signal handler. The host registers are restored in a state compatible with the CPU emulator */ #if defined(CONFIG_SOFTMMU) -void cpu_resume_from_signal(CPUArchState *env, void *puc) +void cpu_resume_from_signal(CPUState *cpu, void *puc) { /* XXX: restore cpu registers saved in host registers */ - env->exception_index = -1; - siglongjmp(env->jmp_env, 1); + cpu->exception_index = -1; + siglongjmp(cpu->jmp_env, 1); } #endif @@ -53,7 +46,25 @@ void cpu_resume_from_signal(CPUArchState *env, void *puc) static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, uint8_t *tb_ptr) { CPUArchState *env = cpu->env_ptr; - tcg_target_ulong next_tb = tcg_qemu_tb_exec(env, tb_ptr); + uintptr_t next_tb; + +#if defined(DEBUG_DISAS) + if (qemu_loglevel_mask(CPU_LOG_TB_CPU)) { +#if defined(TARGET_I386) + log_cpu_state(cpu, CPU_DUMP_CCOP); +#elif defined(TARGET_M68K) + /* ??? Should not modify env state for dumping. */ + cpu_m68k_flush_flags(env, env->cc_op); + env->cc_op = CC_OP_FLAGS; + env->sr = (env->sr & 0xffe0) | env->cc_dest | (env->cc_x << 4); + log_cpu_state(cpu, 0); +#else + log_cpu_state(cpu, 0); +#endif + } +#endif /* DEBUG_DISAS */ + + next_tb = tcg_qemu_tb_exec(env, tb_ptr); if ((next_tb & TB_EXIT_MASK) > TB_EXIT_IDX1) { /* We didn't start executing this TB (eg because the instruction * counter hit zero); we must restore the guest PC to the address @@ -90,7 +101,7 @@ static void cpu_exec_nocache(CPUArchState *env, int max_cycles, if (max_cycles > CF_COUNT_MASK) max_cycles = CF_COUNT_MASK; - tb = tb_gen_code(env, orig_tb->pc, orig_tb->cs_base, orig_tb->flags, + tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags, max_cycles); cpu->current_tb = tb; /* execute the generated code */ @@ -105,6 +116,7 @@ static TranslationBlock *tb_find_slow(CPUArchState *env, target_ulong cs_base, uint64_t flags) { + CPUState *cpu = ENV_GET_CPU(env); TranslationBlock *tb, **ptb1; unsigned int h; tb_page_addr_t phys_pc, phys_page1; @@ -142,7 +154,7 @@ static TranslationBlock *tb_find_slow(CPUArchState *env, } not_found: /* if no translated code available, then translate it now */ - tb = tb_gen_code(env, pc, cs_base, flags, 0); + tb = tb_gen_code(cpu, pc, cs_base, flags, 0); found: /* Move the last found TB to the head of the list */ @@ -152,12 +164,13 @@ static TranslationBlock *tb_find_slow(CPUArchState *env, tcg_ctx.tb_ctx.tb_phys_hash[h] = tb; } /* we add the TB in the virtual pc hash table */ - env->tb_jmp_cache[tb_jmp_cache_hash_func(pc)] = tb; + cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)] = tb; return tb; } static inline TranslationBlock *tb_find_fast(CPUArchState *env) { + CPUState *cpu = ENV_GET_CPU(env); TranslationBlock *tb; target_ulong cs_base, pc; int flags; @@ -166,7 +179,7 @@ static inline TranslationBlock *tb_find_fast(CPUArchState *env) always be the same before a given translated block is executed. */ cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags); - tb = env->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]; + tb = cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]; if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base || tb->flags != flags)) { tb = tb_find_slow(env, pc, cs_base, flags); @@ -183,10 +196,11 @@ void cpu_set_debug_excp_handler(CPUDebugExcpHandler *handler) static void cpu_handle_debug_exception(CPUArchState *env) { + CPUState *cpu = ENV_GET_CPU(env); CPUWatchpoint *wp; - if (!env->watchpoint_hit) { - QTAILQ_FOREACH(wp, &env->watchpoints, entry) { + if (!cpu->watchpoint_hit) { + QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) { wp->flags &= ~BP_WATCHPOINT_HIT; } } @@ -205,11 +219,16 @@ int cpu_exec(CPUArchState *env) #if !(defined(CONFIG_USER_ONLY) && \ (defined(TARGET_M68K) || defined(TARGET_PPC) || defined(TARGET_S390X))) CPUClass *cc = CPU_GET_CLASS(cpu); +#endif +#ifdef TARGET_I386 + X86CPU *x86_cpu = X86_CPU(cpu); #endif int ret, interrupt_request; TranslationBlock *tb; uint8_t *tc_ptr; - tcg_target_ulong next_tb; + uintptr_t next_tb; + /* This must be volatile so it is not trashed by longjmp() */ + volatile bool have_tb_lock = false; if (cpu->halted) { if (!cpu_has_work(cpu)) { @@ -262,16 +281,16 @@ int cpu_exec(CPUArchState *env) #else #error unsupported target CPU #endif - env->exception_index = -1; + cpu->exception_index = -1; /* prepare setjmp context for exception handling */ for(;;) { - if (sigsetjmp(env->jmp_env, 0) == 0) { + if (sigsetjmp(cpu->jmp_env, 0) == 0) { /* if an exception is pending, we execute it here */ - if (env->exception_index >= 0) { - if (env->exception_index >= EXCP_INTERRUPT) { + if (cpu->exception_index >= 0) { + if (cpu->exception_index >= EXCP_INTERRUPT) { /* exit request from the cpu execution loop */ - ret = env->exception_index; + ret = cpu->exception_index; if (ret == EXCP_DEBUG) { cpu_handle_debug_exception(env); } @@ -284,11 +303,11 @@ int cpu_exec(CPUArchState *env) #if defined(TARGET_I386) cc->do_interrupt(cpu); #endif - ret = env->exception_index; + ret = cpu->exception_index; break; #else cc->do_interrupt(cpu); - env->exception_index = -1; + cpu->exception_index = -1; #endif } } @@ -303,8 +322,8 @@ int cpu_exec(CPUArchState *env) } if (interrupt_request & CPU_INTERRUPT_DEBUG) { cpu->interrupt_request &= ~CPU_INTERRUPT_DEBUG; - env->exception_index = EXCP_DEBUG; - cpu_loop_exit(env); + cpu->exception_index = EXCP_DEBUG; + cpu_loop_exit(cpu); } #if defined(TARGET_ARM) || defined(TARGET_SPARC) || defined(TARGET_MIPS) || \ defined(TARGET_PPC) || defined(TARGET_ALPHA) || defined(TARGET_CRIS) || \ @@ -312,32 +331,38 @@ int cpu_exec(CPUArchState *env) if (interrupt_request & CPU_INTERRUPT_HALT) { cpu->interrupt_request &= ~CPU_INTERRUPT_HALT; cpu->halted = 1; - env->exception_index = EXCP_HLT; - cpu_loop_exit(env); + cpu->exception_index = EXCP_HLT; + cpu_loop_exit(cpu); + } +#endif +#if defined(TARGET_I386) + if (interrupt_request & CPU_INTERRUPT_INIT) { + cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0); + do_cpu_init(x86_cpu); + cpu->exception_index = EXCP_HALTED; + cpu_loop_exit(cpu); + } +#else + if (interrupt_request & CPU_INTERRUPT_RESET) { + cpu_reset(cpu); } #endif #if defined(TARGET_I386) #if !defined(CONFIG_USER_ONLY) if (interrupt_request & CPU_INTERRUPT_POLL) { cpu->interrupt_request &= ~CPU_INTERRUPT_POLL; - apic_poll_irq(env->apic_state); + apic_poll_irq(x86_cpu->apic_state); } #endif - if (interrupt_request & CPU_INTERRUPT_INIT) { - cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, - 0); - do_cpu_init(x86_env_get_cpu(env)); - env->exception_index = EXCP_HALTED; - cpu_loop_exit(env); - } else if (interrupt_request & CPU_INTERRUPT_SIPI) { - do_cpu_sipi(x86_env_get_cpu(env)); + if (interrupt_request & CPU_INTERRUPT_SIPI) { + do_cpu_sipi(x86_cpu); } else if (env->hflags2 & HF2_GIF_MASK) { if ((interrupt_request & CPU_INTERRUPT_SMI) && !(env->hflags & HF_SMM_MASK)) { cpu_svm_check_intercept_param(env, SVM_EXIT_SMI, 0); cpu->interrupt_request &= ~CPU_INTERRUPT_SMI; - do_smm_enter(x86_env_get_cpu(env)); + do_smm_enter(x86_cpu); next_tb = 0; } else if ((interrupt_request & CPU_INTERRUPT_NMI) && !(env->hflags2 & HF2_NMI_MASK)) { @@ -374,7 +399,10 @@ int cpu_exec(CPUArchState *env) /* FIXME: this should respect TPR */ cpu_svm_check_intercept_param(env, SVM_EXIT_VINTR, 0); - intno = ldl_phys(env->vm_vmcb + offsetof(struct vmcb, control.int_vector)); + intno = ldl_phys(cpu->as, + env->vm_vmcb + + offsetof(struct vmcb, + control.int_vector)); qemu_log_mask(CPU_LOG_TB_IN_ASM, "Servicing virtual hardware INT=0x%02x\n", intno); do_interrupt_x86_hardirq(env, intno, 1); cpu->interrupt_request &= ~CPU_INTERRUPT_VIRQ; @@ -383,9 +411,6 @@ int cpu_exec(CPUArchState *env) } } #elif defined(TARGET_PPC) - if ((interrupt_request & CPU_INTERRUPT_RESET)) { - cpu_reset(cpu); - } if (interrupt_request & CPU_INTERRUPT_HARD) { ppc_hw_interrupt(env); if (env->pending_interrupts == 0) { @@ -396,7 +421,7 @@ int cpu_exec(CPUArchState *env) #elif defined(TARGET_LM32) if ((interrupt_request & CPU_INTERRUPT_HARD) && (env->ie & IE_IE)) { - env->exception_index = EXCP_IRQ; + cpu->exception_index = EXCP_IRQ; cc->do_interrupt(cpu); next_tb = 0; } @@ -405,7 +430,7 @@ int cpu_exec(CPUArchState *env) && (env->sregs[SR_MSR] & MSR_IE) && !(env->sregs[SR_MSR] & (MSR_EIP | MSR_BIP)) && !(env->iflags & (D_FLAG | IMM_FLAG))) { - env->exception_index = EXCP_IRQ; + cpu->exception_index = EXCP_IRQ; cc->do_interrupt(cpu); next_tb = 0; } @@ -413,7 +438,7 @@ int cpu_exec(CPUArchState *env) if ((interrupt_request & CPU_INTERRUPT_HARD) && cpu_mips_hw_interrupts_pending(env)) { /* Raise it */ - env->exception_index = EXCP_EXT_INTERRUPT; + cpu->exception_index = EXCP_EXT_INTERRUPT; env->error_code = 0; cc->do_interrupt(cpu); next_tb = 0; @@ -430,7 +455,7 @@ int cpu_exec(CPUArchState *env) idx = EXCP_TICK; } if (idx >= 0) { - env->exception_index = idx; + cpu->exception_index = idx; cc->do_interrupt(cpu); next_tb = 0; } @@ -445,7 +470,7 @@ int cpu_exec(CPUArchState *env) if (((type == TT_EXTINT) && cpu_pil_allowed(env, pil)) || type != TT_EXTINT) { - env->exception_index = env->interrupt_index; + cpu->exception_index = env->interrupt_index; cc->do_interrupt(cpu); next_tb = 0; } @@ -453,8 +478,8 @@ int cpu_exec(CPUArchState *env) } #elif defined(TARGET_ARM) if (interrupt_request & CPU_INTERRUPT_FIQ - && !(env->uncached_cpsr & CPSR_F)) { - env->exception_index = EXCP_FIQ; + && !(env->daif & PSTATE_F)) { + cpu->exception_index = EXCP_FIQ; cc->do_interrupt(cpu); next_tb = 0; } @@ -469,15 +494,15 @@ int cpu_exec(CPUArchState *env) pc contains a magic address. */ if (interrupt_request & CPU_INTERRUPT_HARD && ((IS_M(env) && env->regs[15] < 0xfffffff0) - || !(env->uncached_cpsr & CPSR_I))) { - env->exception_index = EXCP_IRQ; + || !(env->daif & PSTATE_I))) { + cpu->exception_index = EXCP_IRQ; cc->do_interrupt(cpu); next_tb = 0; } #elif defined(TARGET_UNICORE32) if (interrupt_request & CPU_INTERRUPT_HARD && !(env->uncached_asr & ASR_I)) { - env->exception_index = UC32_EXCP_INTR; + cpu->exception_index = UC32_EXCP_INTR; cc->do_interrupt(cpu); next_tb = 0; } @@ -512,7 +537,7 @@ int cpu_exec(CPUArchState *env) } } if (idx >= 0) { - env->exception_index = idx; + cpu->exception_index = idx; env->error_code = 0; cc->do_interrupt(cpu); next_tb = 0; @@ -522,7 +547,7 @@ int cpu_exec(CPUArchState *env) if (interrupt_request & CPU_INTERRUPT_HARD && (env->pregs[PR_CCS] & I_FLAG) && !env->locked_irq) { - env->exception_index = EXCP_IRQ; + cpu->exception_index = EXCP_IRQ; cc->do_interrupt(cpu); next_tb = 0; } @@ -534,7 +559,7 @@ int cpu_exec(CPUArchState *env) m_flag_archval = M_FLAG_V32; } if ((env->pregs[PR_CCS] & m_flag_archval)) { - env->exception_index = EXCP_NMI; + cpu->exception_index = EXCP_NMI; cc->do_interrupt(cpu); next_tb = 0; } @@ -548,7 +573,7 @@ int cpu_exec(CPUArchState *env) hardware doesn't rely on this, so we provide/save the vector when the interrupt is first signalled. */ - env->exception_index = env->pending_vector; + cpu->exception_index = env->pending_vector; do_interrupt_m68k_hardirq(env); next_tb = 0; } @@ -560,7 +585,7 @@ int cpu_exec(CPUArchState *env) } #elif defined(TARGET_XTENSA) if (interrupt_request & CPU_INTERRUPT_HARD) { - env->exception_index = EXC_IRQ; + cpu->exception_index = EXC_IRQ; cc->do_interrupt(cpu); next_tb = 0; } @@ -576,26 +601,11 @@ int cpu_exec(CPUArchState *env) } if (unlikely(cpu->exit_request)) { cpu->exit_request = 0; - env->exception_index = EXCP_INTERRUPT; - cpu_loop_exit(env); + cpu->exception_index = EXCP_INTERRUPT; + cpu_loop_exit(cpu); } -#if defined(DEBUG_DISAS) - if (qemu_loglevel_mask(CPU_LOG_TB_CPU)) { - /* restore flags in standard format */ -#if defined(TARGET_I386) - log_cpu_state(cpu, CPU_DUMP_CCOP); -#elif defined(TARGET_M68K) - cpu_m68k_flush_flags(env, env->cc_op); - env->cc_op = CC_OP_FLAGS; - env->sr = (env->sr & 0xffe0) - | env->cc_dest | (env->cc_x << 4); - log_cpu_state(cpu, 0); -#else - log_cpu_state(cpu, 0); -#endif - } -#endif /* DEBUG_DISAS */ spin_lock(&tcg_ctx.tb_ctx.tb_lock); + have_tb_lock = true; tb = tb_find_fast(env); /* Note: we do it here to avoid a gcc bug on Mac OS X when doing it in tb_find_slow */ @@ -617,6 +627,7 @@ int cpu_exec(CPUArchState *env) tb_add_jump((TranslationBlock *)(next_tb & ~TB_EXIT_MASK), next_tb & TB_EXIT_MASK, tb); } + have_tb_lock = false; spin_unlock(&tcg_ctx.tb_ctx.tb_lock); /* cpu_interrupt might be called while translating the @@ -646,25 +657,25 @@ int cpu_exec(CPUArchState *env) /* Instruction counter expired. */ int insns_left; tb = (TranslationBlock *)(next_tb & ~TB_EXIT_MASK); - insns_left = env->icount_decr.u32; - if (env->icount_extra && insns_left >= 0) { + insns_left = cpu->icount_decr.u32; + if (cpu->icount_extra && insns_left >= 0) { /* Refill decrementer and continue execution. */ - env->icount_extra += insns_left; - if (env->icount_extra > 0xffff) { + cpu->icount_extra += insns_left; + if (cpu->icount_extra > 0xffff) { insns_left = 0xffff; } else { - insns_left = env->icount_extra; + insns_left = cpu->icount_extra; } - env->icount_extra -= insns_left; - env->icount_decr.u16.low = insns_left; + cpu->icount_extra -= insns_left; + cpu->icount_decr.u16.low = insns_left; } else { if (insns_left > 0) { /* Execute remaining instructions. */ cpu_exec_nocache(env, insns_left, tb); } - env->exception_index = EXCP_INTERRUPT; + cpu->exception_index = EXCP_INTERRUPT; next_tb = 0; - cpu_loop_exit(env); + cpu_loop_exit(cpu); } break; } @@ -681,6 +692,17 @@ int cpu_exec(CPUArchState *env) * local variables as longjmp is marked 'noreturn'. */ cpu = current_cpu; env = cpu->env_ptr; +#if !(defined(CONFIG_USER_ONLY) && \ + (defined(TARGET_M68K) || defined(TARGET_PPC) || defined(TARGET_S390X))) + cc = CPU_GET_CLASS(cpu); +#endif +#ifdef TARGET_I386 + x86_cpu = X86_CPU(cpu); +#endif + if (have_tb_lock) { + spin_unlock(&tcg_ctx.tb_ctx.tb_lock); + have_tb_lock = false; + } } } /* for(;;) */ diff --git a/cpus.c b/cpus.c index 0f65e763f2..ce668b7211 100644 --- a/cpus.c +++ b/cpus.c @@ -37,6 +37,7 @@ #include "sysemu/qtest.h" #include "qemu/main-loop.h" #include "qemu/bitmap.h" +#include "qemu/seqlock.h" #ifndef _WIN32 #include "qemu/compatfd.h" @@ -62,15 +63,20 @@ static CPUState *next_cpu; +bool cpu_is_stopped(CPUState *cpu) +{ + return cpu->stopped || !runstate_is_running(); +} + static bool cpu_thread_is_idle(CPUState *cpu) { if (cpu->stop || cpu->queued_work_first) { return false; } - if (cpu->stopped || !runstate_is_running()) { + if (cpu_is_stopped(cpu)) { return true; } - if (!cpu->halted || qemu_cpu_has_work(cpu) || + if (!cpu->halted || cpu_has_work(cpu) || kvm_halt_in_kernel()) { return false; } @@ -81,7 +87,7 @@ static bool all_cpu_threads_idle(void) { CPUState *cpu; - for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) { + CPU_FOREACH(cpu) { if (!cpu_thread_is_idle(cpu)) { return false; } @@ -92,21 +98,32 @@ static bool all_cpu_threads_idle(void) /***********************************************************/ /* guest cycle counter */ +/* Protected by TimersState seqlock */ + +/* Compensate for varying guest execution speed. */ +static int64_t qemu_icount_bias; +static int64_t vm_clock_warp_start; /* Conversion factor from emulated instructions to virtual clock ticks. */ static int icount_time_shift; /* Arbitrarily pick 1MIPS as the minimum allowable speed. */ #define MAX_ICOUNT_SHIFT 10 -/* Compensate for varying guest execution speed. */ -static int64_t qemu_icount_bias; + +/* Only written by TCG thread */ +static int64_t qemu_icount; + static QEMUTimer *icount_rt_timer; static QEMUTimer *icount_vm_timer; static QEMUTimer *icount_warp_timer; -static int64_t vm_clock_warp_start; -static int64_t qemu_icount; typedef struct TimersState { + /* Protected by BQL. */ int64_t cpu_ticks_prev; int64_t cpu_ticks_offset; + + /* cpu_clock_offset can be read out of BQL, so protect it with + * this lock. + */ + QemuSeqLock vm_clock_seqlock; int64_t cpu_clock_offset; int32_t cpu_ticks_enabled; int64_t dummy; @@ -115,74 +132,115 @@ typedef struct TimersState { static TimersState timers_state; /* Return the virtual CPU time, based on the instruction counter. */ -int64_t cpu_get_icount(void) +static int64_t cpu_get_icount_locked(void) { int64_t icount; CPUState *cpu = current_cpu; icount = qemu_icount; if (cpu) { - CPUArchState *env = cpu->env_ptr; - if (!can_do_io(env)) { + if (!cpu_can_do_io(cpu)) { fprintf(stderr, "Bad clock read\n"); } - icount -= (env->icount_decr.u16.low + env->icount_extra); + icount -= (cpu->icount_decr.u16.low + cpu->icount_extra); } return qemu_icount_bias + (icount << icount_time_shift); } +int64_t cpu_get_icount(void) +{ + int64_t icount; + unsigned start; + + do { + start = seqlock_read_begin(&timers_state.vm_clock_seqlock); + icount = cpu_get_icount_locked(); + } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start)); + + return icount; +} + /* return the host CPU cycle counter and handle stop/restart */ +/* Caller must hold the BQL */ int64_t cpu_get_ticks(void) { + int64_t ticks; + if (use_icount) { return cpu_get_icount(); } - if (!timers_state.cpu_ticks_enabled) { - return timers_state.cpu_ticks_offset; - } else { - int64_t ticks; - ticks = cpu_get_real_ticks(); - if (timers_state.cpu_ticks_prev > ticks) { - /* Note: non increasing ticks may happen if the host uses - software suspend */ - timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks; - } - timers_state.cpu_ticks_prev = ticks; - return ticks + timers_state.cpu_ticks_offset; + + ticks = timers_state.cpu_ticks_offset; + if (timers_state.cpu_ticks_enabled) { + ticks += cpu_get_real_ticks(); + } + + if (timers_state.cpu_ticks_prev > ticks) { + /* Note: non increasing ticks may happen if the host uses + software suspend */ + timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks; + ticks = timers_state.cpu_ticks_prev; } + + timers_state.cpu_ticks_prev = ticks; + return ticks; +} + +static int64_t cpu_get_clock_locked(void) +{ + int64_t ticks; + + ticks = timers_state.cpu_clock_offset; + if (timers_state.cpu_ticks_enabled) { + ticks += get_clock(); + } + + return ticks; } /* return the host CPU monotonic timer and handle stop/restart */ int64_t cpu_get_clock(void) { int64_t ti; - if (!timers_state.cpu_ticks_enabled) { - return timers_state.cpu_clock_offset; - } else { - ti = get_clock(); - return ti + timers_state.cpu_clock_offset; - } + unsigned start; + + do { + start = seqlock_read_begin(&timers_state.vm_clock_seqlock); + ti = cpu_get_clock_locked(); + } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start)); + + return ti; } -/* enable cpu_get_ticks() */ +/* enable cpu_get_ticks() + * Caller must hold BQL which server as mutex for vm_clock_seqlock. + */ void cpu_enable_ticks(void) { + /* Here, the really thing protected by seqlock is cpu_clock_offset. */ + seqlock_write_lock(&timers_state.vm_clock_seqlock); if (!timers_state.cpu_ticks_enabled) { timers_state.cpu_ticks_offset -= cpu_get_real_ticks(); timers_state.cpu_clock_offset -= get_clock(); timers_state.cpu_ticks_enabled = 1; } + seqlock_write_unlock(&timers_state.vm_clock_seqlock); } /* disable cpu_get_ticks() : the clock is stopped. You must not call - cpu_get_ticks() after that. */ + * cpu_get_ticks() after that. + * Caller must hold BQL which server as mutex for vm_clock_seqlock. + */ void cpu_disable_ticks(void) { + /* Here, the really thing protected by seqlock is cpu_clock_offset. */ + seqlock_write_lock(&timers_state.vm_clock_seqlock); if (timers_state.cpu_ticks_enabled) { - timers_state.cpu_ticks_offset = cpu_get_ticks(); - timers_state.cpu_clock_offset = cpu_get_clock(); + timers_state.cpu_ticks_offset += cpu_get_real_ticks(); + timers_state.cpu_clock_offset = cpu_get_clock_locked(); timers_state.cpu_ticks_enabled = 0; } + seqlock_write_unlock(&timers_state.vm_clock_seqlock); } /* Correlation between real and virtual time is always going to be @@ -196,13 +254,19 @@ static void icount_adjust(void) int64_t cur_time; int64_t cur_icount; int64_t delta; + + /* Protected by TimersState mutex. */ static int64_t last_delta; + /* If the VM is not running, then do nothing. */ if (!runstate_is_running()) { return; } - cur_time = cpu_get_clock(); - cur_icount = qemu_get_clock_ns(vm_clock); + + seqlock_write_lock(&timers_state.vm_clock_seqlock); + cur_time = cpu_get_clock_locked(); + cur_icount = cpu_get_icount_locked(); + delta = cur_icount - cur_time; /* FIXME: This is a very crude algorithm, somewhat prone to oscillation. */ if (delta > 0 @@ -219,19 +283,21 @@ static void icount_adjust(void) } last_delta = delta; qemu_icount_bias = cur_icount - (qemu_icount << icount_time_shift); + seqlock_write_unlock(&timers_state.vm_clock_seqlock); } static void icount_adjust_rt(void *opaque) { - qemu_mod_timer(icount_rt_timer, - qemu_get_clock_ms(rt_clock) + 1000); + timer_mod(icount_rt_timer, + qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 1000); icount_adjust(); } static void icount_adjust_vm(void *opaque) { - qemu_mod_timer(icount_vm_timer, - qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10); + timer_mod(icount_vm_timer, + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + + get_ticks_per_sec() / 10); icount_adjust(); } @@ -242,48 +308,59 @@ static int64_t qemu_icount_round(int64_t count) static void icount_warp_rt(void *opaque) { - if (vm_clock_warp_start == -1) { + /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start + * changes from -1 to another value, so the race here is okay. + */ + if (atomic_read(&vm_clock_warp_start) == -1) { return; } + seqlock_write_lock(&timers_state.vm_clock_seqlock); if (runstate_is_running()) { - int64_t clock = qemu_get_clock_ns(rt_clock); - int64_t warp_delta = clock - vm_clock_warp_start; - if (use_icount == 1) { - qemu_icount_bias += warp_delta; - } else { + int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + int64_t warp_delta; + + warp_delta = clock - vm_clock_warp_start; + if (use_icount == 2) { /* - * In adaptive mode, do not let the vm_clock run too + * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too * far ahead of real time. */ - int64_t cur_time = cpu_get_clock(); - int64_t cur_icount = qemu_get_clock_ns(vm_clock); + int64_t cur_time = cpu_get_clock_locked(); + int64_t cur_icount = cpu_get_icount_locked(); int64_t delta = cur_time - cur_icount; - qemu_icount_bias += MIN(warp_delta, delta); - } - if (qemu_clock_expired(vm_clock)) { - qemu_notify_event(); + warp_delta = MIN(warp_delta, delta); } + qemu_icount_bias += warp_delta; } vm_clock_warp_start = -1; + seqlock_write_unlock(&timers_state.vm_clock_seqlock); + + if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) { + qemu_clock_notify(QEMU_CLOCK_VIRTUAL); + } } void qtest_clock_warp(int64_t dest) { - int64_t clock = qemu_get_clock_ns(vm_clock); + int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); assert(qtest_enabled()); while (clock < dest) { - int64_t deadline = qemu_clock_deadline(vm_clock); + int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL); int64_t warp = MIN(dest - clock, deadline); + seqlock_write_lock(&timers_state.vm_clock_seqlock); qemu_icount_bias += warp; - qemu_run_timers(vm_clock); - clock = qemu_get_clock_ns(vm_clock); + seqlock_write_unlock(&timers_state.vm_clock_seqlock); + + qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL); + clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); } - qemu_notify_event(); + qemu_clock_notify(QEMU_CLOCK_VIRTUAL); } -void qemu_clock_warp(QEMUClock *clock) +void qemu_clock_warp(QEMUClockType type) { + int64_t clock; int64_t deadline; /* @@ -291,20 +368,20 @@ void qemu_clock_warp(QEMUClock *clock) * applicable to other clocks. But a clock argument removes the * need for if statements all over the place. */ - if (clock != vm_clock || !use_icount) { + if (type != QEMU_CLOCK_VIRTUAL || !use_icount) { return; } /* - * If the CPUs have been sleeping, advance the vm_clock timer now. This - * ensures that the deadline for the timer is computed correctly below. + * If the CPUs have been sleeping, advance QEMU_CLOCK_VIRTUAL timer now. + * This ensures that the deadline for the timer is computed correctly below. * This also makes sure that the insn counter is synchronized before the * CPU starts running, in case the CPU is woken by an event other than - * the earliest vm_clock timer. + * the earliest QEMU_CLOCK_VIRTUAL timer. */ icount_warp_rt(NULL); - if (!all_cpu_threads_idle() || !qemu_clock_has_timers(vm_clock)) { - qemu_del_timer(icount_warp_timer); + timer_del(icount_warp_timer); + if (!all_cpu_threads_idle()) { return; } @@ -313,28 +390,39 @@ void qemu_clock_warp(QEMUClock *clock) return; } - vm_clock_warp_start = qemu_get_clock_ns(rt_clock); - deadline = qemu_clock_deadline(vm_clock); + /* We want to use the earliest deadline from ALL vm_clocks */ + clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL); + if (deadline < 0) { + return; + } + if (deadline > 0) { /* - * Ensure the vm_clock proceeds even when the virtual CPU goes to + * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to * sleep. Otherwise, the CPU might be waiting for a future timer * interrupt to wake it up, but the interrupt never comes because * the vCPU isn't running any insns and thus doesn't advance the - * vm_clock. + * QEMU_CLOCK_VIRTUAL. * * An extreme solution for this problem would be to never let VCPUs - * sleep in icount mode if there is a pending vm_clock timer; rather - * time could just advance to the next vm_clock event. Instead, we - * do stop VCPUs and only advance vm_clock after some "real" time, - * (related to the time left until the next event) has passed. This - * rt_clock timer will do this. This avoids that the warps are too - * visible externally---for example, you will not be sending network - * packets continuously instead of every 100ms. + * sleep in icount mode if there is a pending QEMU_CLOCK_VIRTUAL + * timer; rather time could just advance to the next QEMU_CLOCK_VIRTUAL + * event. Instead, we do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL + * after some e"real" time, (related to the time left until the next + * event) has passed. The QEMU_CLOCK_REALTIME timer will do this. + * This avoids that the warps are visible externally; for example, + * you will not be sending network packets continuously instead of + * every 100ms. */ - qemu_mod_timer(icount_warp_timer, vm_clock_warp_start + deadline); - } else { - qemu_notify_event(); + seqlock_write_lock(&timers_state.vm_clock_seqlock); + if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) { + vm_clock_warp_start = clock; + } + seqlock_write_unlock(&timers_state.vm_clock_seqlock); + timer_mod_anticipate(icount_warp_timer, clock + deadline); + } else if (deadline == 0) { + qemu_clock_notify(QEMU_CLOCK_VIRTUAL); } } @@ -342,8 +430,7 @@ static const VMStateDescription vmstate_timers = { .name = "timer", .version_id = 2, .minimum_version_id = 1, - .minimum_version_id_old = 1, - .fields = (VMStateField[]) { + .fields = (VMStateField[]) { VMSTATE_INT64(cpu_ticks_offset, TimersState), VMSTATE_INT64(dummy, TimersState), VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2), @@ -353,12 +440,14 @@ static const VMStateDescription vmstate_timers = { void configure_icount(const char *option) { + seqlock_init(&timers_state.vm_clock_seqlock, NULL); vmstate_register(NULL, 0, &vmstate_timers, &timers_state); if (!option) { return; } - icount_warp_timer = qemu_new_timer_ns(rt_clock, icount_warp_rt, NULL); + icount_warp_timer = timer_new_ns(QEMU_CLOCK_REALTIME, + icount_warp_rt, NULL); if (strcmp(option, "auto") != 0) { icount_time_shift = strtol(option, NULL, 0); use_icount = 1; @@ -376,12 +465,15 @@ void configure_icount(const char *option) the virtual time trigger catches emulated time passing too fast. Realtime triggers occur even when idle, so use them less frequently than VM triggers. */ - icount_rt_timer = qemu_new_timer_ms(rt_clock, icount_adjust_rt, NULL); - qemu_mod_timer(icount_rt_timer, - qemu_get_clock_ms(rt_clock) + 1000); - icount_vm_timer = qemu_new_timer_ns(vm_clock, icount_adjust_vm, NULL); - qemu_mod_timer(icount_vm_timer, - qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10); + icount_rt_timer = timer_new_ms(QEMU_CLOCK_REALTIME, + icount_adjust_rt, NULL); + timer_mod(icount_rt_timer, + qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 1000); + icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, + icount_adjust_vm, NULL); + timer_mod(icount_vm_timer, + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + + get_ticks_per_sec() / 10); } /***********************************************************/ @@ -394,7 +486,7 @@ void hw_error(const char *fmt, ...) fprintf(stderr, "qemu: hardware error: "); vfprintf(stderr, fmt, ap); fprintf(stderr, "\n"); - for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) { + CPU_FOREACH(cpu) { fprintf(stderr, "CPU #%d:\n", cpu->cpu_index); cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU); } @@ -406,7 +498,7 @@ void cpu_synchronize_all_states(void) { CPUState *cpu; - for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) { + CPU_FOREACH(cpu) { cpu_synchronize_state(cpu); } } @@ -415,7 +507,7 @@ void cpu_synchronize_all_post_reset(void) { CPUState *cpu; - for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) { + CPU_FOREACH(cpu) { cpu_synchronize_post_reset(cpu); } } @@ -424,16 +516,11 @@ void cpu_synchronize_all_post_init(void) { CPUState *cpu; - for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) { + CPU_FOREACH(cpu) { cpu_synchronize_post_init(cpu); } } -bool cpu_is_stopped(CPUState *cpu) -{ - return !runstate_is_running() || cpu->stopped; -} - static int do_vm_stop(RunState state) { int ret = 0; @@ -457,7 +544,7 @@ static bool cpu_can_run(CPUState *cpu) if (cpu->stop) { return false; } - if (cpu->stopped || !runstate_is_running()) { + if (cpu_is_stopped(cpu)) { return false; } return true; @@ -735,7 +822,7 @@ static void qemu_tcg_wait_io_event(void) while (all_cpu_threads_idle()) { /* Start accounting real time to the virtual clock if the CPUs are idle. */ - qemu_clock_warp(vm_clock); + qemu_clock_warp(QEMU_CLOCK_VIRTUAL); qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex); } @@ -743,7 +830,7 @@ static void qemu_tcg_wait_io_event(void) qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex); } - for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) { + CPU_FOREACH(cpu) { qemu_wait_io_event_common(cpu); } } @@ -837,12 +924,6 @@ static void *qemu_dummy_cpu_thread_fn(void *arg) static void tcg_exec_all(void); -static void tcg_signal_cpu_creation(CPUState *cpu, void *data) -{ - cpu->thread_id = qemu_get_thread_id(); - cpu->created = true; -} - static void *qemu_tcg_cpu_thread_fn(void *arg) { CPUState *cpu = arg; @@ -851,23 +932,31 @@ static void *qemu_tcg_cpu_thread_fn(void *arg) qemu_thread_get_self(cpu->thread); qemu_mutex_lock(&qemu_global_mutex); - qemu_for_each_cpu(tcg_signal_cpu_creation, NULL); + CPU_FOREACH(cpu) { + cpu->thread_id = qemu_get_thread_id(); + cpu->created = true; + } qemu_cond_signal(&qemu_cpu_cond); /* wait for initial kick-off after machine start */ - while (first_cpu->stopped) { + while (QTAILQ_FIRST(&cpus)->stopped) { qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex); /* process any pending work */ - for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) { + CPU_FOREACH(cpu) { qemu_wait_io_event_common(cpu); } } while (1) { tcg_exec_all(); - if (use_icount && qemu_clock_deadline(vm_clock) <= 0) { - qemu_notify_event(); + + if (use_icount) { + int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL); + + if (deadline == 0) { + qemu_clock_notify(QEMU_CLOCK_VIRTUAL); + } } qemu_tcg_wait_io_event(); } @@ -969,13 +1058,12 @@ void qemu_mutex_unlock_iothread(void) static int all_vcpus_paused(void) { - CPUState *cpu = first_cpu; + CPUState *cpu; - while (cpu) { + CPU_FOREACH(cpu) { if (!cpu->stopped) { return 0; } - cpu = cpu->next_cpu; } return 1; @@ -983,23 +1071,20 @@ static int all_vcpus_paused(void) void pause_all_vcpus(void) { - CPUState *cpu = first_cpu; + CPUState *cpu; - qemu_clock_enable(vm_clock, false); - while (cpu) { + qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false); + CPU_FOREACH(cpu) { cpu->stop = true; qemu_cpu_kick(cpu); - cpu = cpu->next_cpu; } if (qemu_in_vcpu_thread()) { cpu_stop_current(); if (!kvm_enabled()) { - cpu = first_cpu; - while (cpu) { + CPU_FOREACH(cpu) { cpu->stop = false; cpu->stopped = true; - cpu = cpu->next_cpu; } return; } @@ -1007,10 +1092,8 @@ void pause_all_vcpus(void) while (!all_vcpus_paused()) { qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex); - cpu = first_cpu; - while (cpu) { + CPU_FOREACH(cpu) { qemu_cpu_kick(cpu); - cpu = cpu->next_cpu; } } } @@ -1024,25 +1107,33 @@ void cpu_resume(CPUState *cpu) void resume_all_vcpus(void) { - CPUState *cpu = first_cpu; + CPUState *cpu; - qemu_clock_enable(vm_clock, true); - while (cpu) { + qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true); + CPU_FOREACH(cpu) { cpu_resume(cpu); - cpu = cpu->next_cpu; } } +/* For temporary buffers for forming a name */ +#define VCPU_THREAD_NAME_SIZE 16 + static void qemu_tcg_init_vcpu(CPUState *cpu) { + char thread_name[VCPU_THREAD_NAME_SIZE]; + + tcg_cpu_address_space_init(cpu, cpu->as); + /* share a single thread for all cpus with TCG */ if (!tcg_cpu_thread) { cpu->thread = g_malloc0(sizeof(QemuThread)); cpu->halt_cond = g_malloc0(sizeof(QemuCond)); qemu_cond_init(cpu->halt_cond); tcg_halt_cond = cpu->halt_cond; - qemu_thread_create(cpu->thread, qemu_tcg_cpu_thread_fn, cpu, - QEMU_THREAD_JOINABLE); + snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG", + cpu->cpu_index); + qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn, + cpu, QEMU_THREAD_JOINABLE); #ifdef _WIN32 cpu->hThread = qemu_thread_get_handle(cpu->thread); #endif @@ -1058,11 +1149,15 @@ static void qemu_tcg_init_vcpu(CPUState *cpu) static void qemu_kvm_start_vcpu(CPUState *cpu) { + char thread_name[VCPU_THREAD_NAME_SIZE]; + cpu->thread = g_malloc0(sizeof(QemuThread)); cpu->halt_cond = g_malloc0(sizeof(QemuCond)); qemu_cond_init(cpu->halt_cond); - qemu_thread_create(cpu->thread, qemu_kvm_cpu_thread_fn, cpu, - QEMU_THREAD_JOINABLE); + snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM", + cpu->cpu_index); + qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn, + cpu, QEMU_THREAD_JOINABLE); while (!cpu->created) { qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex); } @@ -1070,10 +1165,14 @@ static void qemu_kvm_start_vcpu(CPUState *cpu) static void qemu_dummy_start_vcpu(CPUState *cpu) { + char thread_name[VCPU_THREAD_NAME_SIZE]; + cpu->thread = g_malloc0(sizeof(QemuThread)); cpu->halt_cond = g_malloc0(sizeof(QemuCond)); qemu_cond_init(cpu->halt_cond); - qemu_thread_create(cpu->thread, qemu_dummy_cpu_thread_fn, cpu, + snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY", + cpu->cpu_index); + qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu, QEMU_THREAD_JOINABLE); while (!cpu->created) { qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex); @@ -1135,6 +1234,7 @@ int vm_stop_force_state(RunState state) static int tcg_cpu_exec(CPUArchState *env) { + CPUState *cpu = ENV_GET_CPU(env); int ret; #ifdef CONFIG_PROFILER int64_t ti; @@ -1145,16 +1245,28 @@ static int tcg_cpu_exec(CPUArchState *env) #endif if (use_icount) { int64_t count; + int64_t deadline; int decr; - qemu_icount -= (env->icount_decr.u16.low + env->icount_extra); - env->icount_decr.u16.low = 0; - env->icount_extra = 0; - count = qemu_icount_round(qemu_clock_deadline(vm_clock)); + qemu_icount -= (cpu->icount_decr.u16.low + cpu->icount_extra); + cpu->icount_decr.u16.low = 0; + cpu->icount_extra = 0; + deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL); + + /* Maintain prior (possibly buggy) behaviour where if no deadline + * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than + * INT32_MAX nanoseconds ahead, we still use INT32_MAX + * nanoseconds. + */ + if ((deadline < 0) || (deadline > INT32_MAX)) { + deadline = INT32_MAX; + } + + count = qemu_icount_round(deadline); qemu_icount += count; decr = (count > 0xffff) ? 0xffff : count; count -= decr; - env->icount_decr.u16.low = decr; - env->icount_extra = count; + cpu->icount_decr.u16.low = decr; + cpu->icount_extra = count; } ret = cpu_exec(env); #ifdef CONFIG_PROFILER @@ -1163,10 +1275,9 @@ static int tcg_cpu_exec(CPUArchState *env) if (use_icount) { /* Fold pending instructions back into the instruction counter, and clear the interrupt flag. */ - qemu_icount -= (env->icount_decr.u16.low - + env->icount_extra); - env->icount_decr.u32 = 0; - env->icount_extra = 0; + qemu_icount -= (cpu->icount_decr.u16.low + cpu->icount_extra); + cpu->icount_decr.u32 = 0; + cpu->icount_extra = 0; } return ret; } @@ -1175,17 +1286,17 @@ static void tcg_exec_all(void) { int r; - /* Account partial waits to the vm_clock. */ - qemu_clock_warp(vm_clock); + /* Account partial waits to QEMU_CLOCK_VIRTUAL. */ + qemu_clock_warp(QEMU_CLOCK_VIRTUAL); if (next_cpu == NULL) { next_cpu = first_cpu; } - for (; next_cpu != NULL && !exit_request; next_cpu = next_cpu->next_cpu) { + for (; next_cpu != NULL && !exit_request; next_cpu = CPU_NEXT(next_cpu)) { CPUState *cpu = next_cpu; CPUArchState *env = cpu->env_ptr; - qemu_clock_enable(vm_clock, + qemu_clock_enable(QEMU_CLOCK_VIRTUAL, (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0); if (cpu_can_run(cpu)) { @@ -1201,20 +1312,6 @@ static void tcg_exec_all(void) exit_request = 0; } -void set_numa_modes(void) -{ - CPUState *cpu; - int i; - - for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) { - for (i = 0; i < nb_numa_nodes; i++) { - if (test_bit(cpu->cpu_index, node_cpumask[i])) { - cpu->numa_node = i; - } - } - } -} - void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg) { /* XXX: implement xxx_cpu_list for targets that still miss it */ @@ -1228,7 +1325,7 @@ CpuInfoList *qmp_query_cpus(Error **errp) CpuInfoList *head = NULL, *cur_item = NULL; CPUState *cpu; - for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) { + CPU_FOREACH(cpu) { CpuInfoList *info; #if defined(TARGET_I386) X86CPU *x86_cpu = X86_CPU(cpu); @@ -1309,7 +1406,10 @@ void qmp_memsave(int64_t addr, int64_t size, const char *filename, l = sizeof(buf); if (l > size) l = size; - cpu_memory_rw_debug(cpu, addr, buf, l, 0); + if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) { + error_setg(errp, "Invalid addr 0x%016" PRIx64 "specified", addr); + goto exit; + } if (fwrite(buf, 1, l, f) != l) { error_set(errp, QERR_IO_ERROR); goto exit; @@ -1339,7 +1439,7 @@ void qmp_pmemsave(int64_t addr, int64_t size, const char *filename, l = sizeof(buf); if (l > size) l = size; - cpu_physical_memory_rw(addr, buf, l, 0); + cpu_physical_memory_read(addr, buf, l); if (fwrite(buf, 1, l, f) != l) { error_set(errp, QERR_IO_ERROR); goto exit; @@ -1357,14 +1457,27 @@ void qmp_inject_nmi(Error **errp) #if defined(TARGET_I386) CPUState *cs; - for (cs = first_cpu; cs != NULL; cs = cs->next_cpu) { + CPU_FOREACH(cs) { X86CPU *cpu = X86_CPU(cs); - CPUX86State *env = &cpu->env; - if (!env->apic_state) { + if (!cpu->apic_state) { cpu_interrupt(cs, CPU_INTERRUPT_NMI); } else { - apic_deliver_nmi(env->apic_state); + apic_deliver_nmi(cpu->apic_state); + } + } +#elif defined(TARGET_S390X) + CPUState *cs; + S390CPU *cpu; + + CPU_FOREACH(cs) { + cpu = S390_CPU(cs); + if (cpu->env.cpu_num == monitor_get_cpu_index()) { + if (s390_cpu_restart(S390_CPU(cs)) == -1) { + error_set(errp, QERR_UNSUPPORTED); + return; + } + break; } } #else diff --git a/cputlb.c b/cputlb.c index 977c0ca59d..7bd3573025 100644 --- a/cputlb.c +++ b/cputlb.c @@ -26,6 +26,7 @@ #include "exec/cputlb.h" #include "exec/memory-internal.h" +#include "exec/ram_addr.h" //#define DEBUG_TLB //#define DEBUG_TLB_CHECK @@ -33,13 +34,6 @@ /* statistics */ int tlb_flush_count; -static const CPUTLBEntry s_cputlb_empty_entry = { - .addr_read = -1, - .addr_write = -1, - .addr_code = -1, - .addend = -1, -}; - /* NOTE: * If flush_global is true (the usual case), flush all tlb entries. * If flush_global is false, flush (at least) all tlb entries not @@ -52,10 +46,9 @@ static const CPUTLBEntry s_cputlb_empty_entry = { * entries from the TLB at any time, so flushing more entries than * required is only an efficiency issue, not a correctness issue. */ -void tlb_flush(CPUArchState *env, int flush_global) +void tlb_flush(CPUState *cpu, int flush_global) { - CPUState *cpu = ENV_GET_CPU(env); - int i; + CPUArchState *env = cpu->env_ptr; #if defined(DEBUG_TLB) printf("tlb_flush:\n"); @@ -64,15 +57,8 @@ void tlb_flush(CPUArchState *env, int flush_global) links while we are modifying them */ cpu->current_tb = NULL; - for (i = 0; i < CPU_TLB_SIZE; i++) { - int mmu_idx; - - for (mmu_idx = 0; mmu_idx < NB_MMU_MODES; mmu_idx++) { - env->tlb_table[mmu_idx][i] = s_cputlb_empty_entry; - } - } - - memset(env->tb_jmp_cache, 0, TB_JMP_CACHE_SIZE * sizeof (void *)); + memset(env->tlb_table, -1, sizeof(env->tlb_table)); + memset(cpu->tb_jmp_cache, 0, sizeof(cpu->tb_jmp_cache)); env->tlb_flush_addr = -1; env->tlb_flush_mask = 0; @@ -87,13 +73,13 @@ static inline void tlb_flush_entry(CPUTLBEntry *tlb_entry, target_ulong addr) (TARGET_PAGE_MASK | TLB_INVALID_MASK)) || addr == (tlb_entry->addr_code & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) { - *tlb_entry = s_cputlb_empty_entry; + memset(tlb_entry, -1, sizeof(*tlb_entry)); } } -void tlb_flush_page(CPUArchState *env, target_ulong addr) +void tlb_flush_page(CPUState *cpu, target_ulong addr) { - CPUState *cpu = ENV_GET_CPU(env); + CPUArchState *env = cpu->env_ptr; int i; int mmu_idx; @@ -107,7 +93,7 @@ void tlb_flush_page(CPUArchState *env, target_ulong addr) TARGET_FMT_lx "/" TARGET_FMT_lx ")\n", env->tlb_flush_addr, env->tlb_flush_mask); #endif - tlb_flush(env, 1); + tlb_flush(cpu, 1); return; } /* must reset current TB so that interrupts cannot modify the @@ -120,24 +106,23 @@ void tlb_flush_page(CPUArchState *env, target_ulong addr) tlb_flush_entry(&env->tlb_table[mmu_idx][i], addr); } - tb_flush_jmp_cache(env, addr); + tb_flush_jmp_cache(cpu, addr); } /* update the TLBs so that writes to code in the virtual page 'addr' can be detected */ void tlb_protect_code(ram_addr_t ram_addr) { - cpu_physical_memory_reset_dirty(ram_addr, - ram_addr + TARGET_PAGE_SIZE, - CODE_DIRTY_FLAG); + cpu_physical_memory_reset_dirty(ram_addr, TARGET_PAGE_SIZE, + DIRTY_MEMORY_CODE); } /* update the TLB so that writes in physical page 'phys_addr' are no longer tested for self modifying code */ -void tlb_unprotect_code_phys(CPUArchState *env, ram_addr_t ram_addr, +void tlb_unprotect_code_phys(CPUState *cpu, ram_addr_t ram_addr, target_ulong vaddr) { - cpu_physical_memory_set_dirty_flags(ram_addr, CODE_DIRTY_FLAG); + cpu_physical_memory_set_dirty_flag(ram_addr, DIRTY_MEMORY_CODE); } static bool tlb_is_dirty_ram(CPUTLBEntry *tlbe) @@ -169,27 +154,12 @@ static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr) return ram_addr; } -static inline void tlb_update_dirty(CPUTLBEntry *tlb_entry) -{ - ram_addr_t ram_addr; - void *p; - - if (tlb_is_dirty_ram(tlb_entry)) { - p = (void *)(uintptr_t)((tlb_entry->addr_write & TARGET_PAGE_MASK) - + tlb_entry->addend); - ram_addr = qemu_ram_addr_from_host_nofail(p); - if (!cpu_physical_memory_is_dirty(ram_addr)) { - tlb_entry->addr_write |= TLB_NOTDIRTY; - } - } -} - void cpu_tlb_reset_dirty_all(ram_addr_t start1, ram_addr_t length) { CPUState *cpu; CPUArchState *env; - for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) { + CPU_FOREACH(cpu) { int mmu_idx; env = cpu->env_ptr; @@ -251,10 +221,11 @@ static void tlb_add_large_page(CPUArchState *env, target_ulong vaddr, /* Add a new TLB entry. At most one entry for a given virtual address is permitted. Only a single TARGET_PAGE_SIZE region is mapped, the supplied size is only used by tlb_flush_page. */ -void tlb_set_page(CPUArchState *env, target_ulong vaddr, +void tlb_set_page(CPUState *cpu, target_ulong vaddr, hwaddr paddr, int prot, int mmu_idx, target_ulong size) { + CPUArchState *env = cpu->env_ptr; MemoryRegionSection *section; unsigned int index; target_ulong address; @@ -269,7 +240,7 @@ void tlb_set_page(CPUArchState *env, target_ulong vaddr, } sz = size; - section = address_space_translate_for_iotlb(&address_space_memory, paddr, + section = address_space_translate_for_iotlb(cpu->as, paddr, &xlat, &sz); assert(sz >= TARGET_PAGE_SIZE); @@ -290,7 +261,7 @@ void tlb_set_page(CPUArchState *env, target_ulong vaddr, } code_address = address; - iotlb = memory_region_section_get_iotlb(env, section, vaddr, paddr, xlat, + iotlb = memory_region_section_get_iotlb(cpu, section, vaddr, paddr, xlat, prot, &address); index = (vaddr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); @@ -314,7 +285,8 @@ void tlb_set_page(CPUArchState *env, target_ulong vaddr, /* Write access calls the I/O callback. */ te->addr_write = address | TLB_MMIO; } else if (memory_region_is_ram(section->mr) - && !cpu_physical_memory_is_dirty(section->mr->ram_addr + xlat)) { + && cpu_physical_memory_is_clean(section->mr->ram_addr + + xlat)) { te->addr_write = address | TLB_NOTDIRTY; } else { te->addr_write = address; @@ -334,6 +306,7 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env1, target_ulong addr) int mmu_idx, page_index, pd; void *p; MemoryRegion *mr; + CPUState *cpu = ENV_GET_CPU(env1); page_index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1); mmu_idx = cpu_mmu_index(env1); @@ -342,15 +315,14 @@ tb_page_addr_t get_page_addr_code(CPUArchState *env1, target_ulong addr) cpu_ldub_code(env1, addr); } pd = env1->iotlb[mmu_idx][page_index] & ~TARGET_PAGE_MASK; - mr = iotlb_to_region(pd); + mr = iotlb_to_region(cpu->as, pd); if (memory_region_is_unassigned(mr)) { - CPUState *cpu = ENV_GET_CPU(env1); CPUClass *cc = CPU_GET_CLASS(cpu); if (cc->do_unassigned_access) { cc->do_unassigned_access(cpu, addr, false, true, 0, 4); } else { - cpu_abort(env1, "Trying to execute code outside RAM or ROM at 0x" + cpu_abort(cpu, "Trying to execute code outside RAM or ROM at 0x" TARGET_FMT_lx "\n", addr); } } diff --git a/default-configs/aarch64-linux-user.mak b/default-configs/aarch64-linux-user.mak new file mode 100644 index 0000000000..3df7de5b8f --- /dev/null +++ b/default-configs/aarch64-linux-user.mak @@ -0,0 +1,3 @@ +# Default configuration for aarch64-linux-user + +CONFIG_GDBSTUB_XML=y diff --git a/default-configs/aarch64-softmmu.mak b/default-configs/aarch64-softmmu.mak new file mode 100644 index 0000000000..6d3b5c7a46 --- /dev/null +++ b/default-configs/aarch64-softmmu.mak @@ -0,0 +1,6 @@ +# Default configuration for aarch64-softmmu + +# We support all the 32 bit boards so need all their config +include arm-softmmu.mak + +# Currently no 64-bit specific config requirements diff --git a/default-configs/arm-linux-user.mak b/default-configs/arm-linux-user.mak index 46d4aa2d71..413361a022 100644 --- a/default-configs/arm-linux-user.mak +++ b/default-configs/arm-linux-user.mak @@ -1,3 +1 @@ # Default configuration for arm-linux-user - -CONFIG_GDBSTUB_XML=y diff --git a/default-configs/arm-softmmu.mak b/default-configs/arm-softmmu.mak index 27cbe3d088..f3513fa124 100644 --- a/default-configs/arm-softmmu.mak +++ b/default-configs/arm-softmmu.mak @@ -2,7 +2,6 @@ include pci.mak include usb.mak -CONFIG_GDBSTUB_XML=y CONFIG_VGA=y CONFIG_ISA_MMIO=y CONFIG_NAND=y @@ -28,20 +27,22 @@ CONFIG_SSI_SD=y CONFIG_SSI_M25P80=y CONFIG_LAN9118=y CONFIG_SMC91C111=y +CONFIG_ALLWINNER_EMAC=y CONFIG_DS1338=y CONFIG_PFLASH_CFI01=y CONFIG_PFLASH_CFI02=y CONFIG_MICRODRIVE=y CONFIG_USB_MUSB=y -CONFIG_ARM9MPCORE=y CONFIG_ARM11MPCORE=y -CONFIG_ARM15MPCORE=y +CONFIG_A9MPCORE=y +CONFIG_A15MPCORE=y CONFIG_ARM_GIC=y CONFIG_ARM_GIC_KVM=$(CONFIG_KVM) CONFIG_ARM_TIMER=y CONFIG_ARM_MPTIMER=y +CONFIG_A9_GTIMER=y CONFIG_PL011=y CONFIG_PL022=y CONFIG_PL031=y @@ -62,7 +63,9 @@ CONFIG_BITBANG_I2C=y CONFIG_FRAMEBUFFER=y CONFIG_XILINX_SPIPS=y +CONFIG_ARM11SCU=y CONFIG_A9SCU=y +CONFIG_DIGIC=y CONFIG_MARVELL_88W8618=y CONFIG_OMAP=y CONFIG_TSC210X=y @@ -80,3 +83,8 @@ CONFIG_VERSATILE_PCI=y CONFIG_VERSATILE_I2C=y CONFIG_SDHCI=y +CONFIG_INTEGRATOR_DEBUG=y + +CONFIG_ALLWINNER_A10_PIT=y +CONFIG_ALLWINNER_A10_PIC=y +CONFIG_ALLWINNER_A10=y diff --git a/default-configs/armeb-linux-user.mak b/default-configs/armeb-linux-user.mak index 41d0cc4926..bf2ffe7038 100644 --- a/default-configs/armeb-linux-user.mak +++ b/default-configs/armeb-linux-user.mak @@ -1,3 +1 @@ # Default configuration for armeb-linux-user - -CONFIG_GDBSTUB_XML=y diff --git a/default-configs/m68k-linux-user.mak b/default-configs/m68k-linux-user.mak index f3487aa3d9..06cd5ed7ed 100644 --- a/default-configs/m68k-linux-user.mak +++ b/default-configs/m68k-linux-user.mak @@ -1,3 +1 @@ # Default configuration for m68k-linux-user - -CONFIG_GDBSTUB_XML=y diff --git a/default-configs/m68k-softmmu.mak b/default-configs/m68k-softmmu.mak index 51fe5bb321..d9552df076 100644 --- a/default-configs/m68k-softmmu.mak +++ b/default-configs/m68k-softmmu.mak @@ -3,5 +3,4 @@ include pci.mak include usb.mak CONFIG_COLDFIRE=y -CONFIG_GDBSTUB_XML=y CONFIG_PTIMER=y diff --git a/default-configs/mips-softmmu.mak b/default-configs/mips-softmmu.mak index 926709ae52..71177efdff 100644 --- a/default-configs/mips-softmmu.mak +++ b/default-configs/mips-softmmu.mak @@ -34,3 +34,4 @@ CONFIG_JAZZ_LED=y CONFIG_MC146818RTC=y CONFIG_VT82C686=y CONFIG_ISA_TESTDEV=y +CONFIG_EMPTY_SLOT=y diff --git a/default-configs/mips64-softmmu.mak b/default-configs/mips64-softmmu.mak index 0ef3f09c77..617301b753 100644 --- a/default-configs/mips64-softmmu.mak +++ b/default-configs/mips64-softmmu.mak @@ -34,3 +34,4 @@ CONFIG_JAZZ_LED=y CONFIG_MC146818RTC=y CONFIG_VT82C686=y CONFIG_ISA_TESTDEV=y +CONFIG_EMPTY_SLOT=y diff --git a/default-configs/mips64el-softmmu.mak b/default-configs/mips64el-softmmu.mak index 60893182b3..317b151361 100644 --- a/default-configs/mips64el-softmmu.mak +++ b/default-configs/mips64el-softmmu.mak @@ -36,3 +36,4 @@ CONFIG_JAZZ_LED=y CONFIG_MC146818RTC=y CONFIG_VT82C686=y CONFIG_ISA_TESTDEV=y +CONFIG_EMPTY_SLOT=y diff --git a/default-configs/mipsel-softmmu.mak b/default-configs/mipsel-softmmu.mak index cd59e2479a..532a9aefbd 100644 --- a/default-configs/mipsel-softmmu.mak +++ b/default-configs/mipsel-softmmu.mak @@ -34,3 +34,4 @@ CONFIG_JAZZ_LED=y CONFIG_MC146818RTC=y CONFIG_VT82C686=y CONFIG_ISA_TESTDEV=y +CONFIG_EMPTY_SLOT=y diff --git a/default-configs/ppc-linux-user.mak b/default-configs/ppc-linux-user.mak index 681a94598e..6273df2930 100644 --- a/default-configs/ppc-linux-user.mak +++ b/default-configs/ppc-linux-user.mak @@ -1,3 +1 @@ # Default configuration for ppc-linux-user - -CONFIG_GDBSTUB_XML=y diff --git a/default-configs/ppc-softmmu.mak b/default-configs/ppc-softmmu.mak index eac0b28fb9..07c51ced1f 100644 --- a/default-configs/ppc-softmmu.mak +++ b/default-configs/ppc-softmmu.mak @@ -3,7 +3,6 @@ include pci.mak include sound.mak include usb.mak -CONFIG_GDBSTUB_XML=y CONFIG_ISA_MMIO=y CONFIG_ESCC=y CONFIG_M48T59=y @@ -42,8 +41,11 @@ CONFIG_I8259=y CONFIG_XILINX=y CONFIG_XILINX_ETHLITE=y CONFIG_OPENPIC=y +CONFIG_PREP=y +CONFIG_MAC=y CONFIG_E500=y CONFIG_OPENPIC_KVM=$(and $(CONFIG_E500),$(CONFIG_KVM)) # For PReP CONFIG_MC146818RTC=y +CONFIG_ETSEC=y CONFIG_ISA_TESTDEV=y diff --git a/default-configs/ppc64-linux-user.mak b/default-configs/ppc64-linux-user.mak index 089c08f3a0..422d3fbaeb 100644 --- a/default-configs/ppc64-linux-user.mak +++ b/default-configs/ppc64-linux-user.mak @@ -1,3 +1 @@ # Default configuration for ppc64-linux-user - -CONFIG_GDBSTUB_XML=y diff --git a/default-configs/ppc64-softmmu.mak b/default-configs/ppc64-softmmu.mak index 7831c2bf57..e2beac6df1 100644 --- a/default-configs/ppc64-softmmu.mak +++ b/default-configs/ppc64-softmmu.mak @@ -3,7 +3,6 @@ include pci.mak include sound.mak include usb.mak -CONFIG_GDBSTUB_XML=y CONFIG_ISA_MMIO=y CONFIG_ESCC=y CONFIG_M48T59=y @@ -43,10 +42,13 @@ CONFIG_XILINX=y CONFIG_XILINX_ETHLITE=y CONFIG_OPENPIC=y CONFIG_PSERIES=y +CONFIG_PREP=y +CONFIG_MAC=y CONFIG_E500=y CONFIG_OPENPIC_KVM=$(and $(CONFIG_E500),$(CONFIG_KVM)) # For pSeries CONFIG_XICS=$(CONFIG_PSERIES) +CONFIG_XICS_KVM=$(and $(CONFIG_PSERIES),$(CONFIG_KVM)) # For PReP CONFIG_I82378=y CONFIG_I8259=y diff --git a/default-configs/ppc64abi32-linux-user.mak b/default-configs/ppc64abi32-linux-user.mak index f038ffd97c..1c657ec9bb 100644 --- a/default-configs/ppc64abi32-linux-user.mak +++ b/default-configs/ppc64abi32-linux-user.mak @@ -1,3 +1 @@ # Default configuration for ppc64abi32-linux-user - -CONFIG_GDBSTUB_XML=y diff --git a/default-configs/ppcemb-softmmu.mak b/default-configs/ppcemb-softmmu.mak index 86080a7574..1e4fde2190 100644 --- a/default-configs/ppcemb-softmmu.mak +++ b/default-configs/ppcemb-softmmu.mak @@ -3,33 +3,12 @@ include pci.mak include sound.mak include usb.mak -CONFIG_GDBSTUB_XML=y -CONFIG_ISA_MMIO=y -CONFIG_ESCC=y CONFIG_M48T59=y CONFIG_VGA=y CONFIG_VGA_PCI=y CONFIG_SERIAL=y -CONFIG_I8254=y -CONFIG_PCKBD=y -CONFIG_FDC=y CONFIG_I8257=y CONFIG_OPENPIC=y -CONFIG_PREP_PCI=y -CONFIG_MACIO=y -CONFIG_CUDA=y -CONFIG_ADB=y -CONFIG_MAC_NVRAM=y -CONFIG_MAC_DBDMA=y -CONFIG_HEATHROW_PIC=y -CONFIG_GRACKLE_PCI=y -CONFIG_UNIN_PCI=y -CONFIG_DEC_PCI=y -CONFIG_PPCE500_PCI=y -CONFIG_IDE_ISA=y -CONFIG_IDE_CMD646=y -CONFIG_IDE_MACIO=y -CONFIG_NE2000_ISA=y CONFIG_PFLASH_CFI01=y CONFIG_PFLASH_CFI02=y CONFIG_PTIMER=y @@ -37,8 +16,3 @@ CONFIG_I8259=y CONFIG_XILINX=y CONFIG_XILINX_ETHLITE=y CONFIG_OPENPIC=y -CONFIG_E500=y -CONFIG_OPENPIC_KVM=$(and $(CONFIG_E500),$(CONFIG_KVM)) -# For PReP -CONFIG_MC146818RTC=y -CONFIG_ISA_TESTDEV=y diff --git a/default-configs/s390x-softmmu.mak b/default-configs/s390x-softmmu.mak index 81fbc68654..d843dc0d57 100644 --- a/default-configs/s390x-softmmu.mak +++ b/default-configs/s390x-softmmu.mak @@ -1,2 +1,3 @@ CONFIG_VIRTIO=y CONFIG_SCLPCONSOLE=y +CONFIG_S390_FLIC=$(CONFIG_KVM) diff --git a/default-configs/sparc-softmmu.mak b/default-configs/sparc-softmmu.mak index 8fc93dd643..ab796b3d4f 100644 --- a/default-configs/sparc-softmmu.mak +++ b/default-configs/sparc-softmmu.mak @@ -10,6 +10,7 @@ CONFIG_EMPTY_SLOT=y CONFIG_PCNET_COMMON=y CONFIG_LANCE=y CONFIG_TCX=y +CONFIG_CG3=y CONFIG_SLAVIO=y CONFIG_CS4231=y CONFIG_GRLIB=y diff --git a/default-configs/usb.mak b/default-configs/usb.mak index 1bf9075e85..73d84895aa 100644 --- a/default-configs/usb.mak +++ b/default-configs/usb.mak @@ -1,6 +1,7 @@ CONFIG_USB_TABLET_WACOM=y CONFIG_USB_STORAGE_BOT=y CONFIG_USB_STORAGE_UAS=y +CONFIG_USB_STORAGE_MTP=y CONFIG_USB_SMARTCARD=y CONFIG_USB_AUDIO=y CONFIG_USB_SERIAL=y diff --git a/device-hotplug.c b/device-hotplug.c index 103d34ac45..eecb08e2b1 100644 --- a/device-hotplug.c +++ b/device-hotplug.c @@ -33,12 +33,14 @@ DriveInfo *add_init_drive(const char *optstr) { DriveInfo *dinfo; QemuOpts *opts; + MachineClass *mc; opts = drive_def(optstr); if (!opts) return NULL; - dinfo = drive_init(opts, current_machine->block_default_type); + mc = MACHINE_GET_CLASS(current_machine); + dinfo = drive_init(opts, mc->block_default_type); if (!dinfo) { qemu_opts_del(opts); return NULL; diff --git a/device_tree.c b/device_tree.c index ffec99ae29..ca83504819 100644 --- a/device_tree.c +++ b/device_tree.c @@ -41,6 +41,10 @@ void *create_device_tree(int *sizep) if (ret < 0) { goto fail; } + ret = fdt_finish_reservemap(fdt); + if (ret < 0) { + goto fail; + } ret = fdt_begin_node(fdt, ""); if (ret < 0) { goto fail; @@ -127,12 +131,12 @@ static int findnode_nofail(void *fdt, const char *node_path) return offset; } -int qemu_devtree_setprop(void *fdt, const char *node_path, - const char *property, const void *val_array, int size) +int qemu_fdt_setprop(void *fdt, const char *node_path, + const char *property, const void *val, int size) { int r; - r = fdt_setprop(fdt, findnode_nofail(fdt, node_path), property, val_array, size); + r = fdt_setprop(fdt, findnode_nofail(fdt, node_path), property, val, size); if (r < 0) { fprintf(stderr, "%s: Couldn't set %s/%s: %s\n", __func__, node_path, property, fdt_strerror(r)); @@ -142,8 +146,8 @@ int qemu_devtree_setprop(void *fdt, const char *node_path, return r; } -int qemu_devtree_setprop_cell(void *fdt, const char *node_path, - const char *property, uint32_t val) +int qemu_fdt_setprop_cell(void *fdt, const char *node_path, + const char *property, uint32_t val) { int r; @@ -157,15 +161,15 @@ int qemu_devtree_setprop_cell(void *fdt, const char *node_path, return r; } -int qemu_devtree_setprop_u64(void *fdt, const char *node_path, - const char *property, uint64_t val) +int qemu_fdt_setprop_u64(void *fdt, const char *node_path, + const char *property, uint64_t val) { val = cpu_to_be64(val); - return qemu_devtree_setprop(fdt, node_path, property, &val, sizeof(val)); + return qemu_fdt_setprop(fdt, node_path, property, &val, sizeof(val)); } -int qemu_devtree_setprop_string(void *fdt, const char *node_path, - const char *property, const char *string) +int qemu_fdt_setprop_string(void *fdt, const char *node_path, + const char *property, const char *string) { int r; @@ -179,8 +183,8 @@ int qemu_devtree_setprop_string(void *fdt, const char *node_path, return r; } -const void *qemu_devtree_getprop(void *fdt, const char *node_path, - const char *property, int *lenp) +const void *qemu_fdt_getprop(void *fdt, const char *node_path, + const char *property, int *lenp) { int len; const void *r; @@ -196,11 +200,11 @@ const void *qemu_devtree_getprop(void *fdt, const char *node_path, return r; } -uint32_t qemu_devtree_getprop_cell(void *fdt, const char *node_path, - const char *property) +uint32_t qemu_fdt_getprop_cell(void *fdt, const char *node_path, + const char *property) { int len; - const uint32_t *p = qemu_devtree_getprop(fdt, node_path, property, &len); + const uint32_t *p = qemu_fdt_getprop(fdt, node_path, property, &len); if (len != 4) { fprintf(stderr, "%s: %s/%s not 4 bytes long (not a cell?)\n", __func__, node_path, property); @@ -209,7 +213,7 @@ uint32_t qemu_devtree_getprop_cell(void *fdt, const char *node_path, return be32_to_cpu(*p); } -uint32_t qemu_devtree_get_phandle(void *fdt, const char *path) +uint32_t qemu_fdt_get_phandle(void *fdt, const char *path) { uint32_t r; @@ -223,15 +227,15 @@ uint32_t qemu_devtree_get_phandle(void *fdt, const char *path) return r; } -int qemu_devtree_setprop_phandle(void *fdt, const char *node_path, - const char *property, - const char *target_node_path) +int qemu_fdt_setprop_phandle(void *fdt, const char *node_path, + const char *property, + const char *target_node_path) { - uint32_t phandle = qemu_devtree_get_phandle(fdt, target_node_path); - return qemu_devtree_setprop_cell(fdt, node_path, property, phandle); + uint32_t phandle = qemu_fdt_get_phandle(fdt, target_node_path); + return qemu_fdt_setprop_cell(fdt, node_path, property, phandle); } -uint32_t qemu_devtree_alloc_phandle(void *fdt) +uint32_t qemu_fdt_alloc_phandle(void *fdt) { static int phandle = 0x0; @@ -255,7 +259,7 @@ uint32_t qemu_devtree_alloc_phandle(void *fdt) return phandle++; } -int qemu_devtree_nop_node(void *fdt, const char *node_path) +int qemu_fdt_nop_node(void *fdt, const char *node_path) { int r; @@ -269,7 +273,7 @@ int qemu_devtree_nop_node(void *fdt, const char *node_path) return r; } -int qemu_devtree_add_subnode(void *fdt, const char *name) +int qemu_fdt_add_subnode(void *fdt, const char *name) { char *dupname = g_strdup(name); char *basename = strrchr(dupname, '/'); @@ -299,7 +303,7 @@ int qemu_devtree_add_subnode(void *fdt, const char *name) return retval; } -void qemu_devtree_dumpdtb(void *fdt, int size) +void qemu_fdt_dumpdtb(void *fdt, int size) { const char *dumpdtb = qemu_opt_get(qemu_get_machine_opts(), "dumpdtb"); @@ -309,11 +313,11 @@ void qemu_devtree_dumpdtb(void *fdt, int size) } } -int qemu_devtree_setprop_sized_cells_from_array(void *fdt, - const char *node_path, - const char *property, - int numvalues, - uint64_t *values) +int qemu_fdt_setprop_sized_cells_from_array(void *fdt, + const char *node_path, + const char *property, + int numvalues, + uint64_t *values) { uint32_t *propcells; uint64_t value; @@ -338,6 +342,6 @@ int qemu_devtree_setprop_sized_cells_from_array(void *fdt, propcells[cellnum++] = cpu_to_be32(value); } - return qemu_devtree_setprop(fdt, node_path, property, propcells, - cellnum * sizeof(uint32_t)); + return qemu_fdt_setprop(fdt, node_path, property, propcells, + cellnum * sizeof(uint32_t)); } diff --git a/disas.c b/disas.c index 71007fb6a1..79e694483c 100644 --- a/disas.c +++ b/disas.c @@ -158,10 +158,39 @@ print_insn_thumb1(bfd_vma pc, disassemble_info *info) } #endif +static int print_insn_objdump(bfd_vma pc, disassemble_info *info, + const char *prefix) +{ + int i, n = info->buffer_length; + uint8_t *buf = g_malloc(n); + + info->read_memory_func(pc, buf, n, info); + + for (i = 0; i < n; ++i) { + if (i % 32 == 0) { + info->fprintf_func(info->stream, "\n%s: ", prefix); + } + info->fprintf_func(info->stream, "%02x", buf[i]); + } + + g_free(buf); + return n; +} + +static int print_insn_od_host(bfd_vma pc, disassemble_info *info) +{ + return print_insn_objdump(pc, info, "OBJD-H"); +} + +static int print_insn_od_target(bfd_vma pc, disassemble_info *info) +{ + return print_insn_objdump(pc, info, "OBJD-T"); +} + /* Disassemble this for me please... (debugging). 'flags' has the following values: i386 - 1 means 16 bit code, 2 means 64 bit code - arm - bit 0 = thumb, bit 1 = reverse endian + arm - bit 0 = thumb, bit 1 = reverse endian, bit 2 = A64 ppc - nonzero means little endian other targets - unused */ @@ -171,7 +200,7 @@ void target_disas(FILE *out, CPUArchState *env, target_ulong code, target_ulong pc; int count; CPUDebug s; - int (*print_insn)(bfd_vma pc, disassemble_info *info); + int (*print_insn)(bfd_vma pc, disassemble_info *info) = NULL; INIT_DISASSEMBLE_INFO(s.info, out, fprintf); @@ -196,7 +225,15 @@ void target_disas(FILE *out, CPUArchState *env, target_ulong code, } print_insn = print_insn_i386; #elif defined(TARGET_ARM) - if (flags & 1) { + if (flags & 4) { + /* We might not be compiled with the A64 disassembler + * because it needs a C++ compiler; in that case we will + * fall through to the default print_insn_od case. + */ +#if defined(CONFIG_ARM_A64_DIS) + print_insn = print_insn_arm_a64; +#endif + } else if (flags & 1) { print_insn = print_insn_thumb1; } else { print_insn = print_insn_arm; @@ -263,11 +300,10 @@ void target_disas(FILE *out, CPUArchState *env, target_ulong code, #elif defined(TARGET_LM32) s.info.mach = bfd_mach_lm32; print_insn = print_insn_lm32; -#else - fprintf(out, "0x" TARGET_FMT_lx - ": Asm output not supported on this arch\n", code); - return; #endif + if (print_insn == NULL) { + print_insn = print_insn_od_target; + } for (pc = code; size > 0; pc += count, size -= count) { fprintf(out, "0x" TARGET_FMT_lx ": ", pc); @@ -303,7 +339,7 @@ void disas(FILE *out, void *code, unsigned long size) uintptr_t pc; int count; CPUDebug s; - int (*print_insn)(bfd_vma pc, disassemble_info *info); + int (*print_insn)(bfd_vma pc, disassemble_info *info) = NULL; INIT_DISASSEMBLE_INFO(s.info, out, fprintf); s.info.print_address_func = generic_print_host_address; @@ -328,6 +364,8 @@ void disas(FILE *out, void *code, unsigned long size) #elif defined(_ARCH_PPC) s.info.disassembler_options = (char *)"any"; print_insn = print_insn_ppc; +#elif defined(__aarch64__) && defined(CONFIG_ARM_A64_DIS) + print_insn = print_insn_arm_a64; #elif defined(__alpha__) print_insn = print_insn_alpha; #elif defined(__sparc__) @@ -347,11 +385,10 @@ void disas(FILE *out, void *code, unsigned long size) print_insn = print_insn_hppa; #elif defined(__ia64__) print_insn = print_insn_ia64; -#else - fprintf(out, "0x%lx: Asm output not supported on this arch\n", - (long) code); - return; #endif + if (print_insn == NULL) { + print_insn = print_insn_od_host; + } for (pc = (uintptr_t)code; size > 0; pc += count, size -= count) { fprintf(out, "0x%08" PRIxPTR ": ", pc); count = print_insn(pc, &s.info); diff --git a/disas/Makefile.objs b/disas/Makefile.objs index 3b1e77ace5..8dae4daec0 100644 --- a/disas/Makefile.objs +++ b/disas/Makefile.objs @@ -1,5 +1,10 @@ + common-obj-$(CONFIG_ALPHA_DIS) += alpha.o common-obj-$(CONFIG_ARM_DIS) += arm.o +common-obj-$(CONFIG_ARM_A64_DIS) += arm-a64.o +common-obj-$(CONFIG_ARM_A64_DIS) += libvixl/ +libvixldir = $(SRC_PATH)/disas/libvixl +arm-a64.o-cflags := -I$(libvixldir) common-obj-$(CONFIG_CRIS_DIS) += cris.o common-obj-$(CONFIG_HPPA_DIS) += hppa.o common-obj-$(CONFIG_I386_DIS) += i386.o diff --git a/disas/arm-a64.cc b/disas/arm-a64.cc new file mode 100644 index 0000000000..162be0c420 --- /dev/null +++ b/disas/arm-a64.cc @@ -0,0 +1,87 @@ +/* + * ARM A64 disassembly output wrapper to libvixl + * Copyright (c) 2013 Linaro Limited + * Written by Claudio Fontana + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include "a64/disasm-a64.h" + +extern "C" { +#include "disas/bfd.h" +} + +using namespace vixl; + +static Decoder *vixl_decoder = NULL; +static Disassembler *vixl_disasm = NULL; + +/* We don't use libvixl's PrintDisassembler because its output + * is a little unhelpful (trailing newlines, for example). + * Instead we use our own very similar variant so we have + * control over the format. + */ +class QEMUDisassembler : public Disassembler { +public: + explicit QEMUDisassembler(FILE *stream) : stream_(stream) { } + ~QEMUDisassembler() { } + +protected: + void ProcessOutput(Instruction *instr) { + fprintf(stream_, "%08" PRIx32 " %s", + instr->InstructionBits(), GetOutput()); + } + +private: + FILE *stream_; +}; + +static int vixl_is_initialized(void) +{ + return vixl_decoder != NULL; +} + +static void vixl_init(FILE *f) { + vixl_decoder = new Decoder(); + vixl_disasm = new QEMUDisassembler(f); + vixl_decoder->AppendVisitor(vixl_disasm); +} + +#define INSN_SIZE 4 + +/* Disassemble ARM A64 instruction. This is our only entry + * point from QEMU's C code. + */ +int print_insn_arm_a64(uint64_t addr, disassemble_info *info) +{ + uint8_t bytes[INSN_SIZE]; + uint32_t instr; + int status; + + status = info->read_memory_func(addr, bytes, INSN_SIZE, info); + if (status != 0) { + info->memory_error_func(status, addr, info); + return -1; + } + + if (!vixl_is_initialized()) { + vixl_init(info->stream); + } + + instr = bytes[0] | bytes[1] << 8 | bytes[2] << 16 | bytes[3] << 24; + vixl_decoder->Decode(reinterpret_cast(&instr)); + + return INSN_SIZE; +} diff --git a/disas/i386.c b/disas/i386.c index 47f1f2ea61..00ceca9c51 100644 --- a/disas/i386.c +++ b/disas/i386.c @@ -171,6 +171,7 @@ static void print_operand_value (char *buf, size_t bufsize, int hex, bfd_vma dis static void print_displacement (char *, bfd_vma); static void OP_E (int, int); static void OP_G (int, int); +static void OP_vvvv (int, int); static bfd_vma get64 (void); static bfd_signed_vma get32 (void); static bfd_signed_vma get32s (void); @@ -264,6 +265,9 @@ static int rex_used; current instruction. */ static int used_prefixes; +/* The VEX.vvvv register, unencoded. */ +static int vex_reg; + /* Flags stored in PREFIXES. */ #define PREFIX_REPZ 1 #define PREFIX_REPNZ 2 @@ -278,6 +282,10 @@ static int used_prefixes; #define PREFIX_ADDR 0x400 #define PREFIX_FWAIT 0x800 +#define PREFIX_VEX_0F 0x1000 +#define PREFIX_VEX_0F38 0x2000 +#define PREFIX_VEX_0F3A 0x4000 + /* Make sure that bytes from INFO->PRIVATE_DATA->BUFFER (inclusive) to ADDR (exclusive) are valid. Returns 1 for success, longjmps on error. */ @@ -323,6 +331,7 @@ fetch_data(struct disassemble_info *info, bfd_byte *addr) #define XX { NULL, 0 } +#define Bv { OP_vvvv, v_mode } #define Eb { OP_E, b_mode } #define Ev { OP_E, v_mode } #define Ed { OP_E, d_mode } @@ -671,7 +680,8 @@ fetch_data(struct disassemble_info *info, bfd_byte *addr) #define PREGRP102 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 102 } } #define PREGRP103 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 103 } } #define PREGRP104 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 104 } } - +#define PREGRP105 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 105 } } +#define PREGRP106 NULL, { { NULL, USE_PREFIX_USER_TABLE }, { NULL, 106 } } #define X86_64_0 NULL, { { NULL, X86_64_SPECIAL }, { NULL, 0 } } #define X86_64_1 NULL, { { NULL, X86_64_SPECIAL }, { NULL, 1 } } @@ -1449,7 +1459,7 @@ static const unsigned char threebyte_0x38_uses_DATA_prefix[256] = { /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */ /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1, /* df */ /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */ - /* f0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */ + /* f0 */ 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */ /* ------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; @@ -1473,7 +1483,7 @@ static const unsigned char threebyte_0x38_uses_REPNZ_prefix[256] = { /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */ /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */ /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */ - /* f0 */ 1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */ + /* f0 */ 1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */ /* ------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; @@ -1497,7 +1507,7 @@ static const unsigned char threebyte_0x38_uses_REPZ_prefix[256] = { /* c0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* cf */ /* d0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* df */ /* e0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ef */ - /* f0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* ff */ + /* f0 */ 0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, /* ff */ /* ------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; @@ -2632,17 +2642,17 @@ static const struct dis386 prefix_user_table[][4] = { /* PREGRP87 */ { + { "movbe", { Gv, Ev } }, { "(bad)", { XX } }, - { "(bad)", { XX } }, - { "(bad)", { XX } }, + { "movbe", { Gv, Ev } }, { "crc32", { Gdq, { CRC32_Fixup, b_mode } } }, }, /* PREGRP88 */ { + { "movbe", { Ev, Gv } }, { "(bad)", { XX } }, - { "(bad)", { XX } }, - { "(bad)", { XX } }, + { "movbe", { Ev, Gv } }, { "crc32", { Gdq, { CRC32_Fixup, v_mode } } }, }, @@ -2774,6 +2784,22 @@ static const struct dis386 prefix_user_table[][4] = { { "(bad)", { XX } }, }, + /* PREGRP105 */ + { + { "andnS", { Gv, Bv, Ev } }, + { "(bad)", { XX } }, + { "(bad)", { XX } }, + { "(bad)", { XX } }, + }, + + /* PREGRP106 */ + { + { "bextrS", { Gv, Ev, Bv } }, + { "sarxS", { Gv, Ev, Bv } }, + { "shlxS", { Gv, Ev, Bv } }, + { "shrxS", { Gv, Ev, Bv } }, + }, + }; static const struct dis386 x86_64_table[][2] = { @@ -3071,12 +3097,12 @@ static const struct dis386 three_byte_table[][256] = { /* f0 */ { PREGRP87 }, { PREGRP88 }, + { PREGRP105 }, { "(bad)", { XX } }, { "(bad)", { XX } }, { "(bad)", { XX } }, { "(bad)", { XX } }, - { "(bad)", { XX } }, - { "(bad)", { XX } }, + { PREGRP106 }, /* f8 */ { "(bad)", { XX } }, { "(bad)", { XX } }, @@ -3477,6 +3503,74 @@ ckprefix (void) } } +static void +ckvexprefix (void) +{ + int op, vex2, vex3, newrex = 0, newpfx = prefixes; + + if (address_mode == mode_16bit) { + return; + } + + fetch_data(the_info, codep + 1); + op = *codep; + + if (op != 0xc4 && op != 0xc5) { + return; + } + + fetch_data(the_info, codep + 2); + vex2 = codep[1]; + + if (address_mode == mode_32bit && (vex2 & 0xc0) != 0xc0) { + return; + } + + if (op == 0xc4) { + /* Three byte VEX prefix. */ + fetch_data(the_info, codep + 3); + vex3 = codep[2]; + + newrex |= (vex2 & 0x80 ? 0 : REX_R); + newrex |= (vex2 & 0x40 ? 0 : REX_X); + newrex |= (vex2 & 0x20 ? 0 : REX_B); + newrex |= (vex3 & 0x80 ? REX_W : 0); + switch (vex2 & 0x1f) { /* VEX.m-mmmm */ + case 1: + newpfx |= PREFIX_VEX_0F; + break; + case 2: + newpfx |= PREFIX_VEX_0F | PREFIX_VEX_0F38; + break; + case 3: + newpfx |= PREFIX_VEX_0F | PREFIX_VEX_0F3A; + break; + } + vex2 = vex3; + codep += 3; + } else { + /* Two byte VEX prefix. */ + newrex |= (vex2 & 0x80 ? 0 : REX_R); + codep += 2; + } + + vex_reg = (~vex2 >> 3) & 15; /* VEX.vvvv */ + switch (vex2 & 3) { /* VEX.pp */ + case 1: + newpfx |= PREFIX_DATA; /* 0x66 */ + break; + case 2: + newpfx |= PREFIX_REPZ; /* 0xf3 */ + break; + case 3: + newpfx |= PREFIX_REPNZ; /* 0xf2 */ + break; + } + + rex = newrex; + prefixes = newpfx; +} + /* Return the name of the prefix byte PREF, or NULL if PREF is not a prefix byte. */ @@ -3598,6 +3692,7 @@ print_insn (bfd_vma pc, disassemble_info *info) const char *p; struct dis_private priv; unsigned char op; + unsigned char threebyte; if (info->mach == bfd_mach_x86_64_intel_syntax || info->mach == bfd_mach_x86_64) @@ -3752,6 +3847,7 @@ print_insn (bfd_vma pc, disassemble_info *info) obufp = obuf; ckprefix (); + ckvexprefix (); insn_codep = codep; sizeflag = priv.orig_sizeflag; @@ -3775,18 +3871,29 @@ print_insn (bfd_vma pc, disassemble_info *info) } op = 0; + if (prefixes & PREFIX_VEX_0F) + { + used_prefixes |= PREFIX_VEX_0F | PREFIX_VEX_0F38 | PREFIX_VEX_0F3A; + if (prefixes & PREFIX_VEX_0F38) + threebyte = 0x38; + else if (prefixes & PREFIX_VEX_0F3A) + threebyte = 0x3a; + else + threebyte = *codep++; + goto vex_opcode; + } if (*codep == 0x0f) { - unsigned char threebyte; fetch_data(info, codep + 2); - threebyte = *++codep; + threebyte = codep[1]; + codep += 2; + vex_opcode: dp = &dis386_twobyte[threebyte]; - need_modrm = twobyte_has_modrm[*codep]; - uses_DATA_prefix = twobyte_uses_DATA_prefix[*codep]; - uses_REPNZ_prefix = twobyte_uses_REPNZ_prefix[*codep]; - uses_REPZ_prefix = twobyte_uses_REPZ_prefix[*codep]; - uses_LOCK_prefix = (*codep & ~0x02) == 0x20; - codep++; + need_modrm = twobyte_has_modrm[threebyte]; + uses_DATA_prefix = twobyte_uses_DATA_prefix[threebyte]; + uses_REPNZ_prefix = twobyte_uses_REPNZ_prefix[threebyte]; + uses_REPZ_prefix = twobyte_uses_REPZ_prefix[threebyte]; + uses_LOCK_prefix = (threebyte & ~0x02) == 0x20; if (dp->name == NULL && dp->op[0].bytemode == IS_3BYTE_OPCODE) { fetch_data(info, codep + 2); @@ -5291,6 +5398,17 @@ OP_G (int bytemode, int sizeflag) } } +static void +OP_vvvv (int bytemode, int sizeflags) +{ + USED_REX (REX_W); + if (rex & REX_W) { + oappend(names64[vex_reg]); + } else { + oappend(names32[vex_reg]); + } +} + static bfd_vma get64 (void) { diff --git a/disas/libvixl/LICENCE b/disas/libvixl/LICENCE new file mode 100644 index 0000000000..b7e160a3f5 --- /dev/null +++ b/disas/libvixl/LICENCE @@ -0,0 +1,30 @@ +LICENCE +======= + +The software in this repository is covered by the following licence. + +// Copyright 2013, ARM Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of ARM Limited nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/disas/libvixl/Makefile.objs b/disas/libvixl/Makefile.objs new file mode 100644 index 0000000000..0adb3ced7b --- /dev/null +++ b/disas/libvixl/Makefile.objs @@ -0,0 +1,8 @@ +libvixl_OBJS = utils.o \ + a64/instructions-a64.o \ + a64/decoder-a64.o \ + a64/disasm-a64.o + +$(addprefix $(obj)/,$(libvixl_OBJS)): QEMU_CFLAGS += -I$(SRC_PATH)/disas/libvixl + +common-obj-$(CONFIG_ARM_A64_DIS) += $(libvixl_OBJS) diff --git a/disas/libvixl/README b/disas/libvixl/README new file mode 100644 index 0000000000..96814a5dc1 --- /dev/null +++ b/disas/libvixl/README @@ -0,0 +1,12 @@ + +The code in this directory is a subset of libvixl: + https://github.com/armvixl/vixl +(specifically, it is the set of files needed for disassembly only, +taken from libvixl 1.1). +Bugfixes should preferably be sent upstream initially. + +The disassembler does not currently support the entire A64 instruction +set. Notably: + * No Advanced SIMD support. + * Limited support for system instructions. + * A few miscellaneous integer and floating point instructions are missing. diff --git a/disas/libvixl/a64/assembler-a64.h b/disas/libvixl/a64/assembler-a64.h new file mode 100644 index 0000000000..1e2947b283 --- /dev/null +++ b/disas/libvixl/a64/assembler-a64.h @@ -0,0 +1,1895 @@ +// Copyright 2013, ARM Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of ARM Limited nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef VIXL_A64_ASSEMBLER_A64_H_ +#define VIXL_A64_ASSEMBLER_A64_H_ + +#include + +#include "globals.h" +#include "utils.h" +#include "a64/instructions-a64.h" + +namespace vixl { + +typedef uint64_t RegList; +static const int kRegListSizeInBits = sizeof(RegList) * 8; + + +// Registers. + +// Some CPURegister methods can return Register and FPRegister types, so we +// need to declare them in advance. +class Register; +class FPRegister; + + +class CPURegister { + public: + enum RegisterType { + // The kInvalid value is used to detect uninitialized static instances, + // which are always zero-initialized before any constructors are called. + kInvalid = 0, + kRegister, + kFPRegister, + kNoRegister + }; + + CPURegister() : code_(0), size_(0), type_(kNoRegister) { + VIXL_ASSERT(!IsValid()); + VIXL_ASSERT(IsNone()); + } + + CPURegister(unsigned code, unsigned size, RegisterType type) + : code_(code), size_(size), type_(type) { + VIXL_ASSERT(IsValidOrNone()); + } + + unsigned code() const { + VIXL_ASSERT(IsValid()); + return code_; + } + + RegisterType type() const { + VIXL_ASSERT(IsValidOrNone()); + return type_; + } + + RegList Bit() const { + VIXL_ASSERT(code_ < (sizeof(RegList) * 8)); + return IsValid() ? (static_cast(1) << code_) : 0; + } + + unsigned size() const { + VIXL_ASSERT(IsValid()); + return size_; + } + + int SizeInBytes() const { + VIXL_ASSERT(IsValid()); + VIXL_ASSERT(size() % 8 == 0); + return size_ / 8; + } + + int SizeInBits() const { + VIXL_ASSERT(IsValid()); + return size_; + } + + bool Is32Bits() const { + VIXL_ASSERT(IsValid()); + return size_ == 32; + } + + bool Is64Bits() const { + VIXL_ASSERT(IsValid()); + return size_ == 64; + } + + bool IsValid() const { + if (IsValidRegister() || IsValidFPRegister()) { + VIXL_ASSERT(!IsNone()); + return true; + } else { + VIXL_ASSERT(IsNone()); + return false; + } + } + + bool IsValidRegister() const { + return IsRegister() && + ((size_ == kWRegSize) || (size_ == kXRegSize)) && + ((code_ < kNumberOfRegisters) || (code_ == kSPRegInternalCode)); + } + + bool IsValidFPRegister() const { + return IsFPRegister() && + ((size_ == kSRegSize) || (size_ == kDRegSize)) && + (code_ < kNumberOfFPRegisters); + } + + bool IsNone() const { + // kNoRegister types should always have size 0 and code 0. + VIXL_ASSERT((type_ != kNoRegister) || (code_ == 0)); + VIXL_ASSERT((type_ != kNoRegister) || (size_ == 0)); + + return type_ == kNoRegister; + } + + bool Aliases(const CPURegister& other) const { + VIXL_ASSERT(IsValidOrNone() && other.IsValidOrNone()); + return (code_ == other.code_) && (type_ == other.type_); + } + + bool Is(const CPURegister& other) const { + VIXL_ASSERT(IsValidOrNone() && other.IsValidOrNone()); + return Aliases(other) && (size_ == other.size_); + } + + inline bool IsZero() const { + VIXL_ASSERT(IsValid()); + return IsRegister() && (code_ == kZeroRegCode); + } + + inline bool IsSP() const { + VIXL_ASSERT(IsValid()); + return IsRegister() && (code_ == kSPRegInternalCode); + } + + inline bool IsRegister() const { + return type_ == kRegister; + } + + inline bool IsFPRegister() const { + return type_ == kFPRegister; + } + + const Register& W() const; + const Register& X() const; + const FPRegister& S() const; + const FPRegister& D() const; + + inline bool IsSameSizeAndType(const CPURegister& other) const { + return (size_ == other.size_) && (type_ == other.type_); + } + + protected: + unsigned code_; + unsigned size_; + RegisterType type_; + + private: + bool IsValidOrNone() const { + return IsValid() || IsNone(); + } +}; + + +class Register : public CPURegister { + public: + explicit Register() : CPURegister() {} + inline explicit Register(const CPURegister& other) + : CPURegister(other.code(), other.size(), other.type()) { + VIXL_ASSERT(IsValidRegister()); + } + explicit Register(unsigned code, unsigned size) + : CPURegister(code, size, kRegister) {} + + bool IsValid() const { + VIXL_ASSERT(IsRegister() || IsNone()); + return IsValidRegister(); + } + + static const Register& WRegFromCode(unsigned code); + static const Register& XRegFromCode(unsigned code); + + // V8 compatibility. + static const int kNumRegisters = kNumberOfRegisters; + static const int kNumAllocatableRegisters = kNumberOfRegisters - 1; + + private: + static const Register wregisters[]; + static const Register xregisters[]; +}; + + +class FPRegister : public CPURegister { + public: + inline FPRegister() : CPURegister() {} + inline explicit FPRegister(const CPURegister& other) + : CPURegister(other.code(), other.size(), other.type()) { + VIXL_ASSERT(IsValidFPRegister()); + } + inline FPRegister(unsigned code, unsigned size) + : CPURegister(code, size, kFPRegister) {} + + bool IsValid() const { + VIXL_ASSERT(IsFPRegister() || IsNone()); + return IsValidFPRegister(); + } + + static const FPRegister& SRegFromCode(unsigned code); + static const FPRegister& DRegFromCode(unsigned code); + + // V8 compatibility. + static const int kNumRegisters = kNumberOfFPRegisters; + static const int kNumAllocatableRegisters = kNumberOfFPRegisters - 1; + + private: + static const FPRegister sregisters[]; + static const FPRegister dregisters[]; +}; + + +// No*Reg is used to indicate an unused argument, or an error case. Note that +// these all compare equal (using the Is() method). The Register and FPRegister +// variants are provided for convenience. +const Register NoReg; +const FPRegister NoFPReg; +const CPURegister NoCPUReg; + + +#define DEFINE_REGISTERS(N) \ +const Register w##N(N, kWRegSize); \ +const Register x##N(N, kXRegSize); +REGISTER_CODE_LIST(DEFINE_REGISTERS) +#undef DEFINE_REGISTERS +const Register wsp(kSPRegInternalCode, kWRegSize); +const Register sp(kSPRegInternalCode, kXRegSize); + + +#define DEFINE_FPREGISTERS(N) \ +const FPRegister s##N(N, kSRegSize); \ +const FPRegister d##N(N, kDRegSize); +REGISTER_CODE_LIST(DEFINE_FPREGISTERS) +#undef DEFINE_FPREGISTERS + + +// Registers aliases. +const Register ip0 = x16; +const Register ip1 = x17; +const Register lr = x30; +const Register xzr = x31; +const Register wzr = w31; + + +// AreAliased returns true if any of the named registers overlap. Arguments +// set to NoReg are ignored. The system stack pointer may be specified. +bool AreAliased(const CPURegister& reg1, + const CPURegister& reg2, + const CPURegister& reg3 = NoReg, + const CPURegister& reg4 = NoReg, + const CPURegister& reg5 = NoReg, + const CPURegister& reg6 = NoReg, + const CPURegister& reg7 = NoReg, + const CPURegister& reg8 = NoReg); + + +// AreSameSizeAndType returns true if all of the specified registers have the +// same size, and are of the same type. The system stack pointer may be +// specified. Arguments set to NoReg are ignored, as are any subsequent +// arguments. At least one argument (reg1) must be valid (not NoCPUReg). +bool AreSameSizeAndType(const CPURegister& reg1, + const CPURegister& reg2, + const CPURegister& reg3 = NoCPUReg, + const CPURegister& reg4 = NoCPUReg, + const CPURegister& reg5 = NoCPUReg, + const CPURegister& reg6 = NoCPUReg, + const CPURegister& reg7 = NoCPUReg, + const CPURegister& reg8 = NoCPUReg); + + +// Lists of registers. +class CPURegList { + public: + inline explicit CPURegList(CPURegister reg1, + CPURegister reg2 = NoCPUReg, + CPURegister reg3 = NoCPUReg, + CPURegister reg4 = NoCPUReg) + : list_(reg1.Bit() | reg2.Bit() | reg3.Bit() | reg4.Bit()), + size_(reg1.size()), type_(reg1.type()) { + VIXL_ASSERT(AreSameSizeAndType(reg1, reg2, reg3, reg4)); + VIXL_ASSERT(IsValid()); + } + + inline CPURegList(CPURegister::RegisterType type, unsigned size, RegList list) + : list_(list), size_(size), type_(type) { + VIXL_ASSERT(IsValid()); + } + + inline CPURegList(CPURegister::RegisterType type, unsigned size, + unsigned first_reg, unsigned last_reg) + : size_(size), type_(type) { + VIXL_ASSERT(((type == CPURegister::kRegister) && + (last_reg < kNumberOfRegisters)) || + ((type == CPURegister::kFPRegister) && + (last_reg < kNumberOfFPRegisters))); + VIXL_ASSERT(last_reg >= first_reg); + list_ = (UINT64_C(1) << (last_reg + 1)) - 1; + list_ &= ~((UINT64_C(1) << first_reg) - 1); + VIXL_ASSERT(IsValid()); + } + + inline CPURegister::RegisterType type() const { + VIXL_ASSERT(IsValid()); + return type_; + } + + // Combine another CPURegList into this one. Registers that already exist in + // this list are left unchanged. The type and size of the registers in the + // 'other' list must match those in this list. + void Combine(const CPURegList& other) { + VIXL_ASSERT(IsValid()); + VIXL_ASSERT(other.type() == type_); + VIXL_ASSERT(other.RegisterSizeInBits() == size_); + list_ |= other.list(); + } + + // Remove every register in the other CPURegList from this one. Registers that + // do not exist in this list are ignored. The type and size of the registers + // in the 'other' list must match those in this list. + void Remove(const CPURegList& other) { + VIXL_ASSERT(IsValid()); + VIXL_ASSERT(other.type() == type_); + VIXL_ASSERT(other.RegisterSizeInBits() == size_); + list_ &= ~other.list(); + } + + // Variants of Combine and Remove which take a single register. + inline void Combine(const CPURegister& other) { + VIXL_ASSERT(other.type() == type_); + VIXL_ASSERT(other.size() == size_); + Combine(other.code()); + } + + inline void Remove(const CPURegister& other) { + VIXL_ASSERT(other.type() == type_); + VIXL_ASSERT(other.size() == size_); + Remove(other.code()); + } + + // Variants of Combine and Remove which take a single register by its code; + // the type and size of the register is inferred from this list. + inline void Combine(int code) { + VIXL_ASSERT(IsValid()); + VIXL_ASSERT(CPURegister(code, size_, type_).IsValid()); + list_ |= (UINT64_C(1) << code); + } + + inline void Remove(int code) { + VIXL_ASSERT(IsValid()); + VIXL_ASSERT(CPURegister(code, size_, type_).IsValid()); + list_ &= ~(UINT64_C(1) << code); + } + + inline RegList list() const { + VIXL_ASSERT(IsValid()); + return list_; + } + + inline void set_list(RegList new_list) { + VIXL_ASSERT(IsValid()); + list_ = new_list; + } + + // Remove all callee-saved registers from the list. This can be useful when + // preparing registers for an AAPCS64 function call, for example. + void RemoveCalleeSaved(); + + CPURegister PopLowestIndex(); + CPURegister PopHighestIndex(); + + // AAPCS64 callee-saved registers. + static CPURegList GetCalleeSaved(unsigned size = kXRegSize); + static CPURegList GetCalleeSavedFP(unsigned size = kDRegSize); + + // AAPCS64 caller-saved registers. Note that this includes lr. + static CPURegList GetCallerSaved(unsigned size = kXRegSize); + static CPURegList GetCallerSavedFP(unsigned size = kDRegSize); + + inline bool IsEmpty() const { + VIXL_ASSERT(IsValid()); + return list_ == 0; + } + + inline bool IncludesAliasOf(const CPURegister& other) const { + VIXL_ASSERT(IsValid()); + return (type_ == other.type()) && ((other.Bit() & list_) != 0); + } + + inline bool IncludesAliasOf(int code) const { + VIXL_ASSERT(IsValid()); + return ((code & list_) != 0); + } + + inline int Count() const { + VIXL_ASSERT(IsValid()); + return CountSetBits(list_, kRegListSizeInBits); + } + + inline unsigned RegisterSizeInBits() const { + VIXL_ASSERT(IsValid()); + return size_; + } + + inline unsigned RegisterSizeInBytes() const { + int size_in_bits = RegisterSizeInBits(); + VIXL_ASSERT((size_in_bits % 8) == 0); + return size_in_bits / 8; + } + + inline unsigned TotalSizeInBytes() const { + VIXL_ASSERT(IsValid()); + return RegisterSizeInBytes() * Count(); + } + + private: + RegList list_; + unsigned size_; + CPURegister::RegisterType type_; + + bool IsValid() const; +}; + + +// AAPCS64 callee-saved registers. +extern const CPURegList kCalleeSaved; +extern const CPURegList kCalleeSavedFP; + + +// AAPCS64 caller-saved registers. Note that this includes lr. +extern const CPURegList kCallerSaved; +extern const CPURegList kCallerSavedFP; + + +// Operand. +class Operand { + public: + // # + // where is int64_t. + // This is allowed to be an implicit constructor because Operand is + // a wrapper class that doesn't normally perform any type conversion. + Operand(int64_t immediate); // NOLINT(runtime/explicit) + + // rm, { #} + // where is one of {LSL, LSR, ASR, ROR}. + // is uint6_t. + // This is allowed to be an implicit constructor because Operand is + // a wrapper class that doesn't normally perform any type conversion. + Operand(Register reg, + Shift shift = LSL, + unsigned shift_amount = 0); // NOLINT(runtime/explicit) + + // rm, { {#}} + // where is one of {UXTB, UXTH, UXTW, UXTX, SXTB, SXTH, SXTW, SXTX}. + // is uint2_t. + explicit Operand(Register reg, Extend extend, unsigned shift_amount = 0); + + bool IsImmediate() const; + bool IsShiftedRegister() const; + bool IsExtendedRegister() const; + bool IsZero() const; + + // This returns an LSL shift (<= 4) operand as an equivalent extend operand, + // which helps in the encoding of instructions that use the stack pointer. + Operand ToExtendedRegister() const; + + int64_t immediate() const { + VIXL_ASSERT(IsImmediate()); + return immediate_; + } + + Register reg() const { + VIXL_ASSERT(IsShiftedRegister() || IsExtendedRegister()); + return reg_; + } + + Shift shift() const { + VIXL_ASSERT(IsShiftedRegister()); + return shift_; + } + + Extend extend() const { + VIXL_ASSERT(IsExtendedRegister()); + return extend_; + } + + unsigned shift_amount() const { + VIXL_ASSERT(IsShiftedRegister() || IsExtendedRegister()); + return shift_amount_; + } + + private: + int64_t immediate_; + Register reg_; + Shift shift_; + Extend extend_; + unsigned shift_amount_; +}; + + +// MemOperand represents the addressing mode of a load or store instruction. +class MemOperand { + public: + explicit MemOperand(Register base, + ptrdiff_t offset = 0, + AddrMode addrmode = Offset); + explicit MemOperand(Register base, + Register regoffset, + Shift shift = LSL, + unsigned shift_amount = 0); + explicit MemOperand(Register base, + Register regoffset, + Extend extend, + unsigned shift_amount = 0); + explicit MemOperand(Register base, + const Operand& offset, + AddrMode addrmode = Offset); + + const Register& base() const { return base_; } + const Register& regoffset() const { return regoffset_; } + ptrdiff_t offset() const { return offset_; } + AddrMode addrmode() const { return addrmode_; } + Shift shift() const { return shift_; } + Extend extend() const { return extend_; } + unsigned shift_amount() const { return shift_amount_; } + bool IsImmediateOffset() const; + bool IsRegisterOffset() const; + bool IsPreIndex() const; + bool IsPostIndex() const; + + private: + Register base_; + Register regoffset_; + ptrdiff_t offset_; + AddrMode addrmode_; + Shift shift_; + Extend extend_; + unsigned shift_amount_; +}; + + +class Label { + public: + Label() : is_bound_(false), link_(NULL), target_(NULL) {} + ~Label() { + // If the label has been linked to, it needs to be bound to a target. + VIXL_ASSERT(!IsLinked() || IsBound()); + } + + inline Instruction* link() const { return link_; } + inline Instruction* target() const { return target_; } + + inline bool IsBound() const { return is_bound_; } + inline bool IsLinked() const { return link_ != NULL; } + + inline void set_link(Instruction* new_link) { link_ = new_link; } + + static const int kEndOfChain = 0; + + private: + // Indicates if the label has been bound, ie its location is fixed. + bool is_bound_; + // Branches instructions branching to this label form a chained list, with + // their offset indicating where the next instruction is located. + // link_ points to the latest branch instruction generated branching to this + // branch. + // If link_ is not NULL, the label has been linked to. + Instruction* link_; + // The label location. + Instruction* target_; + + friend class Assembler; +}; + + +// TODO: Obtain better values for these, based on real-world data. +const int kLiteralPoolCheckInterval = 4 * KBytes; +const int kRecommendedLiteralPoolRange = 2 * kLiteralPoolCheckInterval; + + +// Control whether a branch over the literal pool should also be emitted. This +// is needed if the literal pool has to be emitted in the middle of the JITted +// code. +enum LiteralPoolEmitOption { + JumpRequired, + NoJumpRequired +}; + + +// Literal pool entry. +class Literal { + public: + Literal(Instruction* pc, uint64_t imm, unsigned size) + : pc_(pc), value_(imm), size_(size) {} + + private: + Instruction* pc_; + int64_t value_; + unsigned size_; + + friend class Assembler; +}; + + +// Assembler. +class Assembler { + public: + Assembler(byte* buffer, unsigned buffer_size); + + // The destructor asserts that one of the following is true: + // * The Assembler object has not been used. + // * Nothing has been emitted since the last Reset() call. + // * Nothing has been emitted since the last FinalizeCode() call. + ~Assembler(); + + // System functions. + + // Start generating code from the beginning of the buffer, discarding any code + // and data that has already been emitted into the buffer. + // + // In order to avoid any accidental transfer of state, Reset ASSERTs that the + // constant pool is not blocked. + void Reset(); + + // Finalize a code buffer of generated instructions. This function must be + // called before executing or copying code from the buffer. + void FinalizeCode(); + + // Label. + // Bind a label to the current PC. + void bind(Label* label); + int UpdateAndGetByteOffsetTo(Label* label); + inline int UpdateAndGetInstructionOffsetTo(Label* label) { + VIXL_ASSERT(Label::kEndOfChain == 0); + return UpdateAndGetByteOffsetTo(label) >> kInstructionSizeLog2; + } + + + // Instruction set functions. + + // Branch / Jump instructions. + // Branch to register. + void br(const Register& xn); + + // Branch with link to register. + void blr(const Register& xn); + + // Branch to register with return hint. + void ret(const Register& xn = lr); + + // Unconditional branch to label. + void b(Label* label); + + // Conditional branch to label. + void b(Label* label, Condition cond); + + // Unconditional branch to PC offset. + void b(int imm26); + + // Conditional branch to PC offset. + void b(int imm19, Condition cond); + + // Branch with link to label. + void bl(Label* label); + + // Branch with link to PC offset. + void bl(int imm26); + + // Compare and branch to label if zero. + void cbz(const Register& rt, Label* label); + + // Compare and branch to PC offset if zero. + void cbz(const Register& rt, int imm19); + + // Compare and branch to label if not zero. + void cbnz(const Register& rt, Label* label); + + // Compare and branch to PC offset if not zero. + void cbnz(const Register& rt, int imm19); + + // Test bit and branch to label if zero. + void tbz(const Register& rt, unsigned bit_pos, Label* label); + + // Test bit and branch to PC offset if zero. + void tbz(const Register& rt, unsigned bit_pos, int imm14); + + // Test bit and branch to label if not zero. + void tbnz(const Register& rt, unsigned bit_pos, Label* label); + + // Test bit and branch to PC offset if not zero. + void tbnz(const Register& rt, unsigned bit_pos, int imm14); + + // Address calculation instructions. + // Calculate a PC-relative address. Unlike for branches the offset in adr is + // unscaled (i.e. the result can be unaligned). + + // Calculate the address of a label. + void adr(const Register& rd, Label* label); + + // Calculate the address of a PC offset. + void adr(const Register& rd, int imm21); + + // Data Processing instructions. + // Add. + void add(const Register& rd, + const Register& rn, + const Operand& operand); + + // Add and update status flags. + void adds(const Register& rd, + const Register& rn, + const Operand& operand); + + // Compare negative. + void cmn(const Register& rn, const Operand& operand); + + // Subtract. + void sub(const Register& rd, + const Register& rn, + const Operand& operand); + + // Subtract and update status flags. + void subs(const Register& rd, + const Register& rn, + const Operand& operand); + + // Compare. + void cmp(const Register& rn, const Operand& operand); + + // Negate. + void neg(const Register& rd, + const Operand& operand); + + // Negate and update status flags. + void negs(const Register& rd, + const Operand& operand); + + // Add with carry bit. + void adc(const Register& rd, + const Register& rn, + const Operand& operand); + + // Add with carry bit and update status flags. + void adcs(const Register& rd, + const Register& rn, + const Operand& operand); + + // Subtract with carry bit. + void sbc(const Register& rd, + const Register& rn, + const Operand& operand); + + // Subtract with carry bit and update status flags. + void sbcs(const Register& rd, + const Register& rn, + const Operand& operand); + + // Negate with carry bit. + void ngc(const Register& rd, + const Operand& operand); + + // Negate with carry bit and update status flags. + void ngcs(const Register& rd, + const Operand& operand); + + // Logical instructions. + // Bitwise and (A & B). + void and_(const Register& rd, + const Register& rn, + const Operand& operand); + + // Bitwise and (A & B) and update status flags. + void ands(const Register& rd, + const Register& rn, + const Operand& operand); + + // Bit test and set flags. + void tst(const Register& rn, const Operand& operand); + + // Bit clear (A & ~B). + void bic(const Register& rd, + const Register& rn, + const Operand& operand); + + // Bit clear (A & ~B) and update status flags. + void bics(const Register& rd, + const Register& rn, + const Operand& operand); + + // Bitwise or (A | B). + void orr(const Register& rd, const Register& rn, const Operand& operand); + + // Bitwise nor (A | ~B). + void orn(const Register& rd, const Register& rn, const Operand& operand); + + // Bitwise eor/xor (A ^ B). + void eor(const Register& rd, const Register& rn, const Operand& operand); + + // Bitwise enor/xnor (A ^ ~B). + void eon(const Register& rd, const Register& rn, const Operand& operand); + + // Logical shift left by variable. + void lslv(const Register& rd, const Register& rn, const Register& rm); + + // Logical shift right by variable. + void lsrv(const Register& rd, const Register& rn, const Register& rm); + + // Arithmetic shift right by variable. + void asrv(const Register& rd, const Register& rn, const Register& rm); + + // Rotate right by variable. + void rorv(const Register& rd, const Register& rn, const Register& rm); + + // Bitfield instructions. + // Bitfield move. + void bfm(const Register& rd, + const Register& rn, + unsigned immr, + unsigned imms); + + // Signed bitfield move. + void sbfm(const Register& rd, + const Register& rn, + unsigned immr, + unsigned imms); + + // Unsigned bitfield move. + void ubfm(const Register& rd, + const Register& rn, + unsigned immr, + unsigned imms); + + // Bfm aliases. + // Bitfield insert. + inline void bfi(const Register& rd, + const Register& rn, + unsigned lsb, + unsigned width) { + VIXL_ASSERT(width >= 1); + VIXL_ASSERT(lsb + width <= rn.size()); + bfm(rd, rn, (rd.size() - lsb) & (rd.size() - 1), width - 1); + } + + // Bitfield extract and insert low. + inline void bfxil(const Register& rd, + const Register& rn, + unsigned lsb, + unsigned width) { + VIXL_ASSERT(width >= 1); + VIXL_ASSERT(lsb + width <= rn.size()); + bfm(rd, rn, lsb, lsb + width - 1); + } + + // Sbfm aliases. + // Arithmetic shift right. + inline void asr(const Register& rd, const Register& rn, unsigned shift) { + VIXL_ASSERT(shift < rd.size()); + sbfm(rd, rn, shift, rd.size() - 1); + } + + // Signed bitfield insert with zero at right. + inline void sbfiz(const Register& rd, + const Register& rn, + unsigned lsb, + unsigned width) { + VIXL_ASSERT(width >= 1); + VIXL_ASSERT(lsb + width <= rn.size()); + sbfm(rd, rn, (rd.size() - lsb) & (rd.size() - 1), width - 1); + } + + // Signed bitfield extract. + inline void sbfx(const Register& rd, + const Register& rn, + unsigned lsb, + unsigned width) { + VIXL_ASSERT(width >= 1); + VIXL_ASSERT(lsb + width <= rn.size()); + sbfm(rd, rn, lsb, lsb + width - 1); + } + + // Signed extend byte. + inline void sxtb(const Register& rd, const Register& rn) { + sbfm(rd, rn, 0, 7); + } + + // Signed extend halfword. + inline void sxth(const Register& rd, const Register& rn) { + sbfm(rd, rn, 0, 15); + } + + // Signed extend word. + inline void sxtw(const Register& rd, const Register& rn) { + sbfm(rd, rn, 0, 31); + } + + // Ubfm aliases. + // Logical shift left. + inline void lsl(const Register& rd, const Register& rn, unsigned shift) { + unsigned reg_size = rd.size(); + VIXL_ASSERT(shift < reg_size); + ubfm(rd, rn, (reg_size - shift) % reg_size, reg_size - shift - 1); + } + + // Logical shift right. + inline void lsr(const Register& rd, const Register& rn, unsigned shift) { + VIXL_ASSERT(shift < rd.size()); + ubfm(rd, rn, shift, rd.size() - 1); + } + + // Unsigned bitfield insert with zero at right. + inline void ubfiz(const Register& rd, + const Register& rn, + unsigned lsb, + unsigned width) { + VIXL_ASSERT(width >= 1); + VIXL_ASSERT(lsb + width <= rn.size()); + ubfm(rd, rn, (rd.size() - lsb) & (rd.size() - 1), width - 1); + } + + // Unsigned bitfield extract. + inline void ubfx(const Register& rd, + const Register& rn, + unsigned lsb, + unsigned width) { + VIXL_ASSERT(width >= 1); + VIXL_ASSERT(lsb + width <= rn.size()); + ubfm(rd, rn, lsb, lsb + width - 1); + } + + // Unsigned extend byte. + inline void uxtb(const Register& rd, const Register& rn) { + ubfm(rd, rn, 0, 7); + } + + // Unsigned extend halfword. + inline void uxth(const Register& rd, const Register& rn) { + ubfm(rd, rn, 0, 15); + } + + // Unsigned extend word. + inline void uxtw(const Register& rd, const Register& rn) { + ubfm(rd, rn, 0, 31); + } + + // Extract. + void extr(const Register& rd, + const Register& rn, + const Register& rm, + unsigned lsb); + + // Conditional select: rd = cond ? rn : rm. + void csel(const Register& rd, + const Register& rn, + const Register& rm, + Condition cond); + + // Conditional select increment: rd = cond ? rn : rm + 1. + void csinc(const Register& rd, + const Register& rn, + const Register& rm, + Condition cond); + + // Conditional select inversion: rd = cond ? rn : ~rm. + void csinv(const Register& rd, + const Register& rn, + const Register& rm, + Condition cond); + + // Conditional select negation: rd = cond ? rn : -rm. + void csneg(const Register& rd, + const Register& rn, + const Register& rm, + Condition cond); + + // Conditional set: rd = cond ? 1 : 0. + void cset(const Register& rd, Condition cond); + + // Conditional set mask: rd = cond ? -1 : 0. + void csetm(const Register& rd, Condition cond); + + // Conditional increment: rd = cond ? rn + 1 : rn. + void cinc(const Register& rd, const Register& rn, Condition cond); + + // Conditional invert: rd = cond ? ~rn : rn. + void cinv(const Register& rd, const Register& rn, Condition cond); + + // Conditional negate: rd = cond ? -rn : rn. + void cneg(const Register& rd, const Register& rn, Condition cond); + + // Rotate right. + inline void ror(const Register& rd, const Register& rs, unsigned shift) { + extr(rd, rs, rs, shift); + } + + // Conditional comparison. + // Conditional compare negative. + void ccmn(const Register& rn, + const Operand& operand, + StatusFlags nzcv, + Condition cond); + + // Conditional compare. + void ccmp(const Register& rn, + const Operand& operand, + StatusFlags nzcv, + Condition cond); + + // Multiply. + void mul(const Register& rd, const Register& rn, const Register& rm); + + // Negated multiply. + void mneg(const Register& rd, const Register& rn, const Register& rm); + + // Signed long multiply: 32 x 32 -> 64-bit. + void smull(const Register& rd, const Register& rn, const Register& rm); + + // Signed multiply high: 64 x 64 -> 64-bit <127:64>. + void smulh(const Register& xd, const Register& xn, const Register& xm); + + // Multiply and accumulate. + void madd(const Register& rd, + const Register& rn, + const Register& rm, + const Register& ra); + + // Multiply and subtract. + void msub(const Register& rd, + const Register& rn, + const Register& rm, + const Register& ra); + + // Signed long multiply and accumulate: 32 x 32 + 64 -> 64-bit. + void smaddl(const Register& rd, + const Register& rn, + const Register& rm, + const Register& ra); + + // Unsigned long multiply and accumulate: 32 x 32 + 64 -> 64-bit. + void umaddl(const Register& rd, + const Register& rn, + const Register& rm, + const Register& ra); + + // Signed long multiply and subtract: 64 - (32 x 32) -> 64-bit. + void smsubl(const Register& rd, + const Register& rn, + const Register& rm, + const Register& ra); + + // Unsigned long multiply and subtract: 64 - (32 x 32) -> 64-bit. + void umsubl(const Register& rd, + const Register& rn, + const Register& rm, + const Register& ra); + + // Signed integer divide. + void sdiv(const Register& rd, const Register& rn, const Register& rm); + + // Unsigned integer divide. + void udiv(const Register& rd, const Register& rn, const Register& rm); + + // Bit reverse. + void rbit(const Register& rd, const Register& rn); + + // Reverse bytes in 16-bit half words. + void rev16(const Register& rd, const Register& rn); + + // Reverse bytes in 32-bit words. + void rev32(const Register& rd, const Register& rn); + + // Reverse bytes. + void rev(const Register& rd, const Register& rn); + + // Count leading zeroes. + void clz(const Register& rd, const Register& rn); + + // Count leading sign bits. + void cls(const Register& rd, const Register& rn); + + // Memory instructions. + // Load integer or FP register. + void ldr(const CPURegister& rt, const MemOperand& src); + + // Store integer or FP register. + void str(const CPURegister& rt, const MemOperand& dst); + + // Load word with sign extension. + void ldrsw(const Register& rt, const MemOperand& src); + + // Load byte. + void ldrb(const Register& rt, const MemOperand& src); + + // Store byte. + void strb(const Register& rt, const MemOperand& dst); + + // Load byte with sign extension. + void ldrsb(const Register& rt, const MemOperand& src); + + // Load half-word. + void ldrh(const Register& rt, const MemOperand& src); + + // Store half-word. + void strh(const Register& rt, const MemOperand& dst); + + // Load half-word with sign extension. + void ldrsh(const Register& rt, const MemOperand& src); + + // Load integer or FP register pair. + void ldp(const CPURegister& rt, const CPURegister& rt2, + const MemOperand& src); + + // Store integer or FP register pair. + void stp(const CPURegister& rt, const CPURegister& rt2, + const MemOperand& dst); + + // Load word pair with sign extension. + void ldpsw(const Register& rt, const Register& rt2, const MemOperand& src); + + // Load integer or FP register pair, non-temporal. + void ldnp(const CPURegister& rt, const CPURegister& rt2, + const MemOperand& src); + + // Store integer or FP register pair, non-temporal. + void stnp(const CPURegister& rt, const CPURegister& rt2, + const MemOperand& dst); + + // Load literal to register. + void ldr(const Register& rt, uint64_t imm); + + // Load double precision floating point literal to FP register. + void ldr(const FPRegister& ft, double imm); + + // Load single precision floating point literal to FP register. + void ldr(const FPRegister& ft, float imm); + + // Move instructions. The default shift of -1 indicates that the move + // instruction will calculate an appropriate 16-bit immediate and left shift + // that is equal to the 64-bit immediate argument. If an explicit left shift + // is specified (0, 16, 32 or 48), the immediate must be a 16-bit value. + // + // For movk, an explicit shift can be used to indicate which half word should + // be overwritten, eg. movk(x0, 0, 0) will overwrite the least-significant + // half word with zero, whereas movk(x0, 0, 48) will overwrite the + // most-significant. + + // Move immediate and keep. + void movk(const Register& rd, uint64_t imm, int shift = -1) { + MoveWide(rd, imm, shift, MOVK); + } + + // Move inverted immediate. + void movn(const Register& rd, uint64_t imm, int shift = -1) { + MoveWide(rd, imm, shift, MOVN); + } + + // Move immediate. + void movz(const Register& rd, uint64_t imm, int shift = -1) { + MoveWide(rd, imm, shift, MOVZ); + } + + // Misc instructions. + // Monitor debug-mode breakpoint. + void brk(int code); + + // Halting debug-mode breakpoint. + void hlt(int code); + + // Move register to register. + void mov(const Register& rd, const Register& rn); + + // Move inverted operand to register. + void mvn(const Register& rd, const Operand& operand); + + // System instructions. + // Move to register from system register. + void mrs(const Register& rt, SystemRegister sysreg); + + // Move from register to system register. + void msr(SystemRegister sysreg, const Register& rt); + + // System hint. + void hint(SystemHint code); + + // Data memory barrier. + void dmb(BarrierDomain domain, BarrierType type); + + // Data synchronization barrier. + void dsb(BarrierDomain domain, BarrierType type); + + // Instruction synchronization barrier. + void isb(); + + // Alias for system instructions. + // No-op. + void nop() { + hint(NOP); + } + + // FP instructions. + // Move double precision immediate to FP register. + void fmov(const FPRegister& fd, double imm); + + // Move single precision immediate to FP register. + void fmov(const FPRegister& fd, float imm); + + // Move FP register to register. + void fmov(const Register& rd, const FPRegister& fn); + + // Move register to FP register. + void fmov(const FPRegister& fd, const Register& rn); + + // Move FP register to FP register. + void fmov(const FPRegister& fd, const FPRegister& fn); + + // FP add. + void fadd(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm); + + // FP subtract. + void fsub(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm); + + // FP multiply. + void fmul(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm); + + // FP fused multiply and add. + void fmadd(const FPRegister& fd, + const FPRegister& fn, + const FPRegister& fm, + const FPRegister& fa); + + // FP fused multiply and subtract. + void fmsub(const FPRegister& fd, + const FPRegister& fn, + const FPRegister& fm, + const FPRegister& fa); + + // FP fused multiply, add and negate. + void fnmadd(const FPRegister& fd, + const FPRegister& fn, + const FPRegister& fm, + const FPRegister& fa); + + // FP fused multiply, subtract and negate. + void fnmsub(const FPRegister& fd, + const FPRegister& fn, + const FPRegister& fm, + const FPRegister& fa); + + // FP divide. + void fdiv(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm); + + // FP maximum. + void fmax(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm); + + // FP minimum. + void fmin(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm); + + // FP maximum number. + void fmaxnm(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm); + + // FP minimum number. + void fminnm(const FPRegister& fd, const FPRegister& fn, const FPRegister& fm); + + // FP absolute. + void fabs(const FPRegister& fd, const FPRegister& fn); + + // FP negate. + void fneg(const FPRegister& fd, const FPRegister& fn); + + // FP square root. + void fsqrt(const FPRegister& fd, const FPRegister& fn); + + // FP round to integer (nearest with ties to away). + void frinta(const FPRegister& fd, const FPRegister& fn); + + // FP round to integer (toward minus infinity). + void frintm(const FPRegister& fd, const FPRegister& fn); + + // FP round to integer (nearest with ties to even). + void frintn(const FPRegister& fd, const FPRegister& fn); + + // FP round to integer (towards zero). + void frintz(const FPRegister& fd, const FPRegister& fn); + + // FP compare registers. + void fcmp(const FPRegister& fn, const FPRegister& fm); + + // FP compare immediate. + void fcmp(const FPRegister& fn, double value); + + // FP conditional compare. + void fccmp(const FPRegister& fn, + const FPRegister& fm, + StatusFlags nzcv, + Condition cond); + + // FP conditional select. + void fcsel(const FPRegister& fd, + const FPRegister& fn, + const FPRegister& fm, + Condition cond); + + // Common FP Convert function. + void FPConvertToInt(const Register& rd, + const FPRegister& fn, + FPIntegerConvertOp op); + + // FP convert between single and double precision. + void fcvt(const FPRegister& fd, const FPRegister& fn); + + // Convert FP to signed integer (nearest with ties to away). + void fcvtas(const Register& rd, const FPRegister& fn); + + // Convert FP to unsigned integer (nearest with ties to away). + void fcvtau(const Register& rd, const FPRegister& fn); + + // Convert FP to signed integer (round towards -infinity). + void fcvtms(const Register& rd, const FPRegister& fn); + + // Convert FP to unsigned integer (round towards -infinity). + void fcvtmu(const Register& rd, const FPRegister& fn); + + // Convert FP to signed integer (nearest with ties to even). + void fcvtns(const Register& rd, const FPRegister& fn); + + // Convert FP to unsigned integer (nearest with ties to even). + void fcvtnu(const Register& rd, const FPRegister& fn); + + // Convert FP to signed integer (round towards zero). + void fcvtzs(const Register& rd, const FPRegister& fn); + + // Convert FP to unsigned integer (round towards zero). + void fcvtzu(const Register& rd, const FPRegister& fn); + + // Convert signed integer or fixed point to FP. + void scvtf(const FPRegister& fd, const Register& rn, unsigned fbits = 0); + + // Convert unsigned integer or fixed point to FP. + void ucvtf(const FPRegister& fd, const Register& rn, unsigned fbits = 0); + + // Emit generic instructions. + // Emit raw instructions into the instruction stream. + inline void dci(Instr raw_inst) { Emit(raw_inst); } + + // Emit 32 bits of data into the instruction stream. + inline void dc32(uint32_t data) { EmitData(&data, sizeof(data)); } + + // Emit 64 bits of data into the instruction stream. + inline void dc64(uint64_t data) { EmitData(&data, sizeof(data)); } + + // Copy a string into the instruction stream, including the terminating NULL + // character. The instruction pointer (pc_) is then aligned correctly for + // subsequent instructions. + void EmitStringData(const char * string) { + VIXL_ASSERT(string != NULL); + + size_t len = strlen(string) + 1; + EmitData(string, len); + + // Pad with NULL characters until pc_ is aligned. + const char pad[] = {'\0', '\0', '\0', '\0'}; + VIXL_STATIC_ASSERT(sizeof(pad) == kInstructionSize); + Instruction* next_pc = AlignUp(pc_, kInstructionSize); + EmitData(&pad, next_pc - pc_); + } + + // Code generation helpers. + + // Register encoding. + static Instr Rd(CPURegister rd) { + VIXL_ASSERT(rd.code() != kSPRegInternalCode); + return rd.code() << Rd_offset; + } + + static Instr Rn(CPURegister rn) { + VIXL_ASSERT(rn.code() != kSPRegInternalCode); + return rn.code() << Rn_offset; + } + + static Instr Rm(CPURegister rm) { + VIXL_ASSERT(rm.code() != kSPRegInternalCode); + return rm.code() << Rm_offset; + } + + static Instr Ra(CPURegister ra) { + VIXL_ASSERT(ra.code() != kSPRegInternalCode); + return ra.code() << Ra_offset; + } + + static Instr Rt(CPURegister rt) { + VIXL_ASSERT(rt.code() != kSPRegInternalCode); + return rt.code() << Rt_offset; + } + + static Instr Rt2(CPURegister rt2) { + VIXL_ASSERT(rt2.code() != kSPRegInternalCode); + return rt2.code() << Rt2_offset; + } + + // These encoding functions allow the stack pointer to be encoded, and + // disallow the zero register. + static Instr RdSP(Register rd) { + VIXL_ASSERT(!rd.IsZero()); + return (rd.code() & kRegCodeMask) << Rd_offset; + } + + static Instr RnSP(Register rn) { + VIXL_ASSERT(!rn.IsZero()); + return (rn.code() & kRegCodeMask) << Rn_offset; + } + + // Flags encoding. + static Instr Flags(FlagsUpdate S) { + if (S == SetFlags) { + return 1 << FlagsUpdate_offset; + } else if (S == LeaveFlags) { + return 0 << FlagsUpdate_offset; + } + VIXL_UNREACHABLE(); + return 0; + } + + static Instr Cond(Condition cond) { + return cond << Condition_offset; + } + + // PC-relative address encoding. + static Instr ImmPCRelAddress(int imm21) { + VIXL_ASSERT(is_int21(imm21)); + Instr imm = static_cast(truncate_to_int21(imm21)); + Instr immhi = (imm >> ImmPCRelLo_width) << ImmPCRelHi_offset; + Instr immlo = imm << ImmPCRelLo_offset; + return (immhi & ImmPCRelHi_mask) | (immlo & ImmPCRelLo_mask); + } + + // Branch encoding. + static Instr ImmUncondBranch(int imm26) { + VIXL_ASSERT(is_int26(imm26)); + return truncate_to_int26(imm26) << ImmUncondBranch_offset; + } + + static Instr ImmCondBranch(int imm19) { + VIXL_ASSERT(is_int19(imm19)); + return truncate_to_int19(imm19) << ImmCondBranch_offset; + } + + static Instr ImmCmpBranch(int imm19) { + VIXL_ASSERT(is_int19(imm19)); + return truncate_to_int19(imm19) << ImmCmpBranch_offset; + } + + static Instr ImmTestBranch(int imm14) { + VIXL_ASSERT(is_int14(imm14)); + return truncate_to_int14(imm14) << ImmTestBranch_offset; + } + + static Instr ImmTestBranchBit(unsigned bit_pos) { + VIXL_ASSERT(is_uint6(bit_pos)); + // Subtract five from the shift offset, as we need bit 5 from bit_pos. + unsigned b5 = bit_pos << (ImmTestBranchBit5_offset - 5); + unsigned b40 = bit_pos << ImmTestBranchBit40_offset; + b5 &= ImmTestBranchBit5_mask; + b40 &= ImmTestBranchBit40_mask; + return b5 | b40; + } + + // Data Processing encoding. + static Instr SF(Register rd) { + return rd.Is64Bits() ? SixtyFourBits : ThirtyTwoBits; + } + + static Instr ImmAddSub(int64_t imm) { + VIXL_ASSERT(IsImmAddSub(imm)); + if (is_uint12(imm)) { // No shift required. + return imm << ImmAddSub_offset; + } else { + return ((imm >> 12) << ImmAddSub_offset) | (1 << ShiftAddSub_offset); + } + } + + static inline Instr ImmS(unsigned imms, unsigned reg_size) { + VIXL_ASSERT(((reg_size == kXRegSize) && is_uint6(imms)) || + ((reg_size == kWRegSize) && is_uint5(imms))); + USE(reg_size); + return imms << ImmS_offset; + } + + static inline Instr ImmR(unsigned immr, unsigned reg_size) { + VIXL_ASSERT(((reg_size == kXRegSize) && is_uint6(immr)) || + ((reg_size == kWRegSize) && is_uint5(immr))); + USE(reg_size); + VIXL_ASSERT(is_uint6(immr)); + return immr << ImmR_offset; + } + + static inline Instr ImmSetBits(unsigned imms, unsigned reg_size) { + VIXL_ASSERT((reg_size == kWRegSize) || (reg_size == kXRegSize)); + VIXL_ASSERT(is_uint6(imms)); + VIXL_ASSERT((reg_size == kXRegSize) || is_uint6(imms + 3)); + USE(reg_size); + return imms << ImmSetBits_offset; + } + + static inline Instr ImmRotate(unsigned immr, unsigned reg_size) { + VIXL_ASSERT((reg_size == kWRegSize) || (reg_size == kXRegSize)); + VIXL_ASSERT(((reg_size == kXRegSize) && is_uint6(immr)) || + ((reg_size == kWRegSize) && is_uint5(immr))); + USE(reg_size); + return immr << ImmRotate_offset; + } + + static inline Instr ImmLLiteral(int imm19) { + VIXL_ASSERT(is_int19(imm19)); + return truncate_to_int19(imm19) << ImmLLiteral_offset; + } + + static inline Instr BitN(unsigned bitn, unsigned reg_size) { + VIXL_ASSERT((reg_size == kWRegSize) || (reg_size == kXRegSize)); + VIXL_ASSERT((reg_size == kXRegSize) || (bitn == 0)); + USE(reg_size); + return bitn << BitN_offset; + } + + static Instr ShiftDP(Shift shift) { + VIXL_ASSERT(shift == LSL || shift == LSR || shift == ASR || shift == ROR); + return shift << ShiftDP_offset; + } + + static Instr ImmDPShift(unsigned amount) { + VIXL_ASSERT(is_uint6(amount)); + return amount << ImmDPShift_offset; + } + + static Instr ExtendMode(Extend extend) { + return extend << ExtendMode_offset; + } + + static Instr ImmExtendShift(unsigned left_shift) { + VIXL_ASSERT(left_shift <= 4); + return left_shift << ImmExtendShift_offset; + } + + static Instr ImmCondCmp(unsigned imm) { + VIXL_ASSERT(is_uint5(imm)); + return imm << ImmCondCmp_offset; + } + + static Instr Nzcv(StatusFlags nzcv) { + return ((nzcv >> Flags_offset) & 0xf) << Nzcv_offset; + } + + // MemOperand offset encoding. + static Instr ImmLSUnsigned(int imm12) { + VIXL_ASSERT(is_uint12(imm12)); + return imm12 << ImmLSUnsigned_offset; + } + + static Instr ImmLS(int imm9) { + VIXL_ASSERT(is_int9(imm9)); + return truncate_to_int9(imm9) << ImmLS_offset; + } + + static Instr ImmLSPair(int imm7, LSDataSize size) { + VIXL_ASSERT(((imm7 >> size) << size) == imm7); + int scaled_imm7 = imm7 >> size; + VIXL_ASSERT(is_int7(scaled_imm7)); + return truncate_to_int7(scaled_imm7) << ImmLSPair_offset; + } + + static Instr ImmShiftLS(unsigned shift_amount) { + VIXL_ASSERT(is_uint1(shift_amount)); + return shift_amount << ImmShiftLS_offset; + } + + static Instr ImmException(int imm16) { + VIXL_ASSERT(is_uint16(imm16)); + return imm16 << ImmException_offset; + } + + static Instr ImmSystemRegister(int imm15) { + VIXL_ASSERT(is_uint15(imm15)); + return imm15 << ImmSystemRegister_offset; + } + + static Instr ImmHint(int imm7) { + VIXL_ASSERT(is_uint7(imm7)); + return imm7 << ImmHint_offset; + } + + static Instr ImmBarrierDomain(int imm2) { + VIXL_ASSERT(is_uint2(imm2)); + return imm2 << ImmBarrierDomain_offset; + } + + static Instr ImmBarrierType(int imm2) { + VIXL_ASSERT(is_uint2(imm2)); + return imm2 << ImmBarrierType_offset; + } + + static LSDataSize CalcLSDataSize(LoadStoreOp op) { + VIXL_ASSERT((SizeLS_offset + SizeLS_width) == (kInstructionSize * 8)); + return static_cast(op >> SizeLS_offset); + } + + // Move immediates encoding. + static Instr ImmMoveWide(uint64_t imm) { + VIXL_ASSERT(is_uint16(imm)); + return imm << ImmMoveWide_offset; + } + + static Instr ShiftMoveWide(int64_t shift) { + VIXL_ASSERT(is_uint2(shift)); + return shift << ShiftMoveWide_offset; + } + + // FP Immediates. + static Instr ImmFP32(float imm); + static Instr ImmFP64(double imm); + + // FP register type. + static Instr FPType(FPRegister fd) { + return fd.Is64Bits() ? FP64 : FP32; + } + + static Instr FPScale(unsigned scale) { + VIXL_ASSERT(is_uint6(scale)); + return scale << FPScale_offset; + } + + // Size of the code generated in bytes + uint64_t SizeOfCodeGenerated() const { + VIXL_ASSERT((pc_ >= buffer_) && (pc_ < (buffer_ + buffer_size_))); + return pc_ - buffer_; + } + + // Size of the code generated since label to the current position. + uint64_t SizeOfCodeGeneratedSince(Label* label) const { + VIXL_ASSERT(label->IsBound()); + VIXL_ASSERT((pc_ >= label->target()) && (pc_ < (buffer_ + buffer_size_))); + return pc_ - label->target(); + } + + + inline void BlockLiteralPool() { + literal_pool_monitor_++; + } + + inline void ReleaseLiteralPool() { + if (--literal_pool_monitor_ == 0) { + // Has the literal pool been blocked for too long? + VIXL_ASSERT(literals_.empty() || + (pc_ < (literals_.back()->pc_ + kMaxLoadLiteralRange))); + } + } + + inline bool IsLiteralPoolBlocked() { + return literal_pool_monitor_ != 0; + } + + void CheckLiteralPool(LiteralPoolEmitOption option = JumpRequired); + void EmitLiteralPool(LiteralPoolEmitOption option = NoJumpRequired); + size_t LiteralPoolSize(); + + protected: + inline const Register& AppropriateZeroRegFor(const CPURegister& reg) const { + return reg.Is64Bits() ? xzr : wzr; + } + + + void LoadStore(const CPURegister& rt, + const MemOperand& addr, + LoadStoreOp op); + static bool IsImmLSUnscaled(ptrdiff_t offset); + static bool IsImmLSScaled(ptrdiff_t offset, LSDataSize size); + + void Logical(const Register& rd, + const Register& rn, + const Operand& operand, + LogicalOp op); + void LogicalImmediate(const Register& rd, + const Register& rn, + unsigned n, + unsigned imm_s, + unsigned imm_r, + LogicalOp op); + static bool IsImmLogical(uint64_t value, + unsigned width, + unsigned* n, + unsigned* imm_s, + unsigned* imm_r); + + void ConditionalCompare(const Register& rn, + const Operand& operand, + StatusFlags nzcv, + Condition cond, + ConditionalCompareOp op); + static bool IsImmConditionalCompare(int64_t immediate); + + void AddSubWithCarry(const Register& rd, + const Register& rn, + const Operand& operand, + FlagsUpdate S, + AddSubWithCarryOp op); + + static bool IsImmFP32(float imm); + static bool IsImmFP64(double imm); + + // Functions for emulating operands not directly supported by the instruction + // set. + void EmitShift(const Register& rd, + const Register& rn, + Shift shift, + unsigned amount); + void EmitExtendShift(const Register& rd, + const Register& rn, + Extend extend, + unsigned left_shift); + + void AddSub(const Register& rd, + const Register& rn, + const Operand& operand, + FlagsUpdate S, + AddSubOp op); + static bool IsImmAddSub(int64_t immediate); + + // Find an appropriate LoadStoreOp or LoadStorePairOp for the specified + // registers. Only simple loads are supported; sign- and zero-extension (such + // as in LDPSW_x or LDRB_w) are not supported. + static LoadStoreOp LoadOpFor(const CPURegister& rt); + static LoadStorePairOp LoadPairOpFor(const CPURegister& rt, + const CPURegister& rt2); + static LoadStoreOp StoreOpFor(const CPURegister& rt); + static LoadStorePairOp StorePairOpFor(const CPURegister& rt, + const CPURegister& rt2); + static LoadStorePairNonTemporalOp LoadPairNonTemporalOpFor( + const CPURegister& rt, const CPURegister& rt2); + static LoadStorePairNonTemporalOp StorePairNonTemporalOpFor( + const CPURegister& rt, const CPURegister& rt2); + + + private: + // Instruction helpers. + void MoveWide(const Register& rd, + uint64_t imm, + int shift, + MoveWideImmediateOp mov_op); + void DataProcShiftedRegister(const Register& rd, + const Register& rn, + const Operand& operand, + FlagsUpdate S, + Instr op); + void DataProcExtendedRegister(const Register& rd, + const Register& rn, + const Operand& operand, + FlagsUpdate S, + Instr op); + void LoadStorePair(const CPURegister& rt, + const CPURegister& rt2, + const MemOperand& addr, + LoadStorePairOp op); + void LoadStorePairNonTemporal(const CPURegister& rt, + const CPURegister& rt2, + const MemOperand& addr, + LoadStorePairNonTemporalOp op); + void LoadLiteral(const CPURegister& rt, uint64_t imm, LoadLiteralOp op); + void ConditionalSelect(const Register& rd, + const Register& rn, + const Register& rm, + Condition cond, + ConditionalSelectOp op); + void DataProcessing1Source(const Register& rd, + const Register& rn, + DataProcessing1SourceOp op); + void DataProcessing3Source(const Register& rd, + const Register& rn, + const Register& rm, + const Register& ra, + DataProcessing3SourceOp op); + void FPDataProcessing1Source(const FPRegister& fd, + const FPRegister& fn, + FPDataProcessing1SourceOp op); + void FPDataProcessing2Source(const FPRegister& fd, + const FPRegister& fn, + const FPRegister& fm, + FPDataProcessing2SourceOp op); + void FPDataProcessing3Source(const FPRegister& fd, + const FPRegister& fn, + const FPRegister& fm, + const FPRegister& fa, + FPDataProcessing3SourceOp op); + + void RecordLiteral(int64_t imm, unsigned size); + + // Emit the instruction at pc_. + void Emit(Instr instruction) { + VIXL_STATIC_ASSERT(sizeof(*pc_) == 1); + VIXL_STATIC_ASSERT(sizeof(instruction) == kInstructionSize); + VIXL_ASSERT((pc_ + sizeof(instruction)) <= (buffer_ + buffer_size_)); + +#ifdef DEBUG + finalized_ = false; +#endif + + memcpy(pc_, &instruction, sizeof(instruction)); + pc_ += sizeof(instruction); + CheckBufferSpace(); + } + + // Emit data inline in the instruction stream. + void EmitData(void const * data, unsigned size) { + VIXL_STATIC_ASSERT(sizeof(*pc_) == 1); + VIXL_ASSERT((pc_ + size) <= (buffer_ + buffer_size_)); + +#ifdef DEBUG + finalized_ = false; +#endif + + // TODO: Record this 'instruction' as data, so that it can be disassembled + // correctly. + memcpy(pc_, data, size); + pc_ += size; + CheckBufferSpace(); + } + + inline void CheckBufferSpace() { + VIXL_ASSERT(pc_ < (buffer_ + buffer_size_)); + if (pc_ > next_literal_pool_check_) { + CheckLiteralPool(); + } + } + + // The buffer into which code and relocation info are generated. + Instruction* buffer_; + // Buffer size, in bytes. + unsigned buffer_size_; + Instruction* pc_; + std::list literals_; + Instruction* next_literal_pool_check_; + unsigned literal_pool_monitor_; + + friend class BlockLiteralPoolScope; + +#ifdef DEBUG + bool finalized_; +#endif +}; + +class BlockLiteralPoolScope { + public: + explicit BlockLiteralPoolScope(Assembler* assm) : assm_(assm) { + assm_->BlockLiteralPool(); + } + + ~BlockLiteralPoolScope() { + assm_->ReleaseLiteralPool(); + } + + private: + Assembler* assm_; +}; +} // namespace vixl + +#endif // VIXL_A64_ASSEMBLER_A64_H_ diff --git a/disas/libvixl/a64/constants-a64.h b/disas/libvixl/a64/constants-a64.h new file mode 100644 index 0000000000..99677c1be3 --- /dev/null +++ b/disas/libvixl/a64/constants-a64.h @@ -0,0 +1,1134 @@ +// Copyright 2013, ARM Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of ARM Limited nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef VIXL_A64_CONSTANTS_A64_H_ +#define VIXL_A64_CONSTANTS_A64_H_ + +namespace vixl { + +const unsigned kNumberOfRegisters = 32; +const unsigned kNumberOfFPRegisters = 32; +// Callee saved registers are x21-x30(lr). +const int kNumberOfCalleeSavedRegisters = 10; +const int kFirstCalleeSavedRegisterIndex = 21; +// Callee saved FP registers are d8-d15. +const int kNumberOfCalleeSavedFPRegisters = 8; +const int kFirstCalleeSavedFPRegisterIndex = 8; + +#define REGISTER_CODE_LIST(R) \ +R(0) R(1) R(2) R(3) R(4) R(5) R(6) R(7) \ +R(8) R(9) R(10) R(11) R(12) R(13) R(14) R(15) \ +R(16) R(17) R(18) R(19) R(20) R(21) R(22) R(23) \ +R(24) R(25) R(26) R(27) R(28) R(29) R(30) R(31) + +#define INSTRUCTION_FIELDS_LIST(V_) \ +/* Register fields */ \ +V_(Rd, 4, 0, Bits) /* Destination register. */ \ +V_(Rn, 9, 5, Bits) /* First source register. */ \ +V_(Rm, 20, 16, Bits) /* Second source register. */ \ +V_(Ra, 14, 10, Bits) /* Third source register. */ \ +V_(Rt, 4, 0, Bits) /* Load dest / store source. */ \ +V_(Rt2, 14, 10, Bits) /* Load second dest / */ \ + /* store second source. */ \ +V_(PrefetchMode, 4, 0, Bits) \ + \ +/* Common bits */ \ +V_(SixtyFourBits, 31, 31, Bits) \ +V_(FlagsUpdate, 29, 29, Bits) \ + \ +/* PC relative addressing */ \ +V_(ImmPCRelHi, 23, 5, SignedBits) \ +V_(ImmPCRelLo, 30, 29, Bits) \ + \ +/* Add/subtract/logical shift register */ \ +V_(ShiftDP, 23, 22, Bits) \ +V_(ImmDPShift, 15, 10, Bits) \ + \ +/* Add/subtract immediate */ \ +V_(ImmAddSub, 21, 10, Bits) \ +V_(ShiftAddSub, 23, 22, Bits) \ + \ +/* Add/substract extend */ \ +V_(ImmExtendShift, 12, 10, Bits) \ +V_(ExtendMode, 15, 13, Bits) \ + \ +/* Move wide */ \ +V_(ImmMoveWide, 20, 5, Bits) \ +V_(ShiftMoveWide, 22, 21, Bits) \ + \ +/* Logical immediate, bitfield and extract */ \ +V_(BitN, 22, 22, Bits) \ +V_(ImmRotate, 21, 16, Bits) \ +V_(ImmSetBits, 15, 10, Bits) \ +V_(ImmR, 21, 16, Bits) \ +V_(ImmS, 15, 10, Bits) \ + \ +/* Test and branch immediate */ \ +V_(ImmTestBranch, 18, 5, SignedBits) \ +V_(ImmTestBranchBit40, 23, 19, Bits) \ +V_(ImmTestBranchBit5, 31, 31, Bits) \ + \ +/* Conditionals */ \ +V_(Condition, 15, 12, Bits) \ +V_(ConditionBranch, 3, 0, Bits) \ +V_(Nzcv, 3, 0, Bits) \ +V_(ImmCondCmp, 20, 16, Bits) \ +V_(ImmCondBranch, 23, 5, SignedBits) \ + \ +/* Floating point */ \ +V_(FPType, 23, 22, Bits) \ +V_(ImmFP, 20, 13, Bits) \ +V_(FPScale, 15, 10, Bits) \ + \ +/* Load Store */ \ +V_(ImmLS, 20, 12, SignedBits) \ +V_(ImmLSUnsigned, 21, 10, Bits) \ +V_(ImmLSPair, 21, 15, SignedBits) \ +V_(SizeLS, 31, 30, Bits) \ +V_(ImmShiftLS, 12, 12, Bits) \ + \ +/* Other immediates */ \ +V_(ImmUncondBranch, 25, 0, SignedBits) \ +V_(ImmCmpBranch, 23, 5, SignedBits) \ +V_(ImmLLiteral, 23, 5, SignedBits) \ +V_(ImmException, 20, 5, Bits) \ +V_(ImmHint, 11, 5, Bits) \ +V_(ImmBarrierDomain, 11, 10, Bits) \ +V_(ImmBarrierType, 9, 8, Bits) \ + \ +/* System (MRS, MSR) */ \ +V_(ImmSystemRegister, 19, 5, Bits) \ +V_(SysO0, 19, 19, Bits) \ +V_(SysOp1, 18, 16, Bits) \ +V_(SysOp2, 7, 5, Bits) \ +V_(CRn, 15, 12, Bits) \ +V_(CRm, 11, 8, Bits) \ + + +#define SYSTEM_REGISTER_FIELDS_LIST(V_, M_) \ +/* NZCV */ \ +V_(Flags, 31, 28, Bits) \ +V_(N, 31, 31, Bits) \ +V_(Z, 30, 30, Bits) \ +V_(C, 29, 29, Bits) \ +V_(V, 28, 28, Bits) \ +M_(NZCV, Flags_mask) \ + \ +/* FPCR */ \ +V_(AHP, 26, 26, Bits) \ +V_(DN, 25, 25, Bits) \ +V_(FZ, 24, 24, Bits) \ +V_(RMode, 23, 22, Bits) \ +M_(FPCR, AHP_mask | DN_mask | FZ_mask | RMode_mask) + + +// Fields offsets. +#define DECLARE_FIELDS_OFFSETS(Name, HighBit, LowBit, X) \ +const int Name##_offset = LowBit; \ +const int Name##_width = HighBit - LowBit + 1; \ +const uint32_t Name##_mask = ((1 << Name##_width) - 1) << LowBit; +#define NOTHING(A, B) +INSTRUCTION_FIELDS_LIST(DECLARE_FIELDS_OFFSETS) +SYSTEM_REGISTER_FIELDS_LIST(DECLARE_FIELDS_OFFSETS, NOTHING) +#undef NOTHING +#undef DECLARE_FIELDS_BITS + +// ImmPCRel is a compound field (not present in INSTRUCTION_FIELDS_LIST), formed +// from ImmPCRelLo and ImmPCRelHi. +const int ImmPCRel_mask = ImmPCRelLo_mask | ImmPCRelHi_mask; + +// Condition codes. +enum Condition { + eq = 0, + ne = 1, + hs = 2, + lo = 3, + mi = 4, + pl = 5, + vs = 6, + vc = 7, + hi = 8, + ls = 9, + ge = 10, + lt = 11, + gt = 12, + le = 13, + al = 14, + nv = 15 // Behaves as always/al. +}; + +inline Condition InvertCondition(Condition cond) { + // Conditions al and nv behave identically, as "always true". They can't be + // inverted, because there is no "always false" condition. + VIXL_ASSERT((cond != al) && (cond != nv)); + return static_cast(cond ^ 1); +} + +enum FlagsUpdate { + SetFlags = 1, + LeaveFlags = 0 +}; + +enum StatusFlags { + NoFlag = 0, + + // Derive the flag combinations from the system register bit descriptions. + NFlag = N_mask, + ZFlag = Z_mask, + CFlag = C_mask, + VFlag = V_mask, + NZFlag = NFlag | ZFlag, + NCFlag = NFlag | CFlag, + NVFlag = NFlag | VFlag, + ZCFlag = ZFlag | CFlag, + ZVFlag = ZFlag | VFlag, + CVFlag = CFlag | VFlag, + NZCFlag = NFlag | ZFlag | CFlag, + NZVFlag = NFlag | ZFlag | VFlag, + NCVFlag = NFlag | CFlag | VFlag, + ZCVFlag = ZFlag | CFlag | VFlag, + NZCVFlag = NFlag | ZFlag | CFlag | VFlag, + + // Floating-point comparison results. + FPEqualFlag = ZCFlag, + FPLessThanFlag = NFlag, + FPGreaterThanFlag = CFlag, + FPUnorderedFlag = CVFlag +}; + +enum Shift { + NO_SHIFT = -1, + LSL = 0x0, + LSR = 0x1, + ASR = 0x2, + ROR = 0x3 +}; + +enum Extend { + NO_EXTEND = -1, + UXTB = 0, + UXTH = 1, + UXTW = 2, + UXTX = 3, + SXTB = 4, + SXTH = 5, + SXTW = 6, + SXTX = 7 +}; + +enum SystemHint { + NOP = 0, + YIELD = 1, + WFE = 2, + WFI = 3, + SEV = 4, + SEVL = 5 +}; + +enum BarrierDomain { + OuterShareable = 0, + NonShareable = 1, + InnerShareable = 2, + FullSystem = 3 +}; + +enum BarrierType { + BarrierOther = 0, + BarrierReads = 1, + BarrierWrites = 2, + BarrierAll = 3 +}; + +// System/special register names. +// This information is not encoded as one field but as the concatenation of +// multiple fields (Op0<0>, Op1, Crn, Crm, Op2). +enum SystemRegister { + NZCV = ((0x1 << SysO0_offset) | + (0x3 << SysOp1_offset) | + (0x4 << CRn_offset) | + (0x2 << CRm_offset) | + (0x0 << SysOp2_offset)) >> ImmSystemRegister_offset, + FPCR = ((0x1 << SysO0_offset) | + (0x3 << SysOp1_offset) | + (0x4 << CRn_offset) | + (0x4 << CRm_offset) | + (0x0 << SysOp2_offset)) >> ImmSystemRegister_offset +}; + +// Instruction enumerations. +// +// These are the masks that define a class of instructions, and the list of +// instructions within each class. Each enumeration has a Fixed, FMask and +// Mask value. +// +// Fixed: The fixed bits in this instruction class. +// FMask: The mask used to extract the fixed bits in the class. +// Mask: The mask used to identify the instructions within a class. +// +// The enumerations can be used like this: +// +// VIXL_ASSERT(instr->Mask(PCRelAddressingFMask) == PCRelAddressingFixed); +// switch(instr->Mask(PCRelAddressingMask)) { +// case ADR: Format("adr 'Xd, 'AddrPCRelByte"); break; +// case ADRP: Format("adrp 'Xd, 'AddrPCRelPage"); break; +// default: printf("Unknown instruction\n"); +// } + + +// Generic fields. +enum GenericInstrField { + SixtyFourBits = 0x80000000, + ThirtyTwoBits = 0x00000000, + FP32 = 0x00000000, + FP64 = 0x00400000 +}; + +// PC relative addressing. +enum PCRelAddressingOp { + PCRelAddressingFixed = 0x10000000, + PCRelAddressingFMask = 0x1F000000, + PCRelAddressingMask = 0x9F000000, + ADR = PCRelAddressingFixed | 0x00000000, + ADRP = PCRelAddressingFixed | 0x80000000 +}; + +// Add/sub (immediate, shifted and extended.) +const int kSFOffset = 31; +enum AddSubOp { + AddSubOpMask = 0x60000000, + AddSubSetFlagsBit = 0x20000000, + ADD = 0x00000000, + ADDS = ADD | AddSubSetFlagsBit, + SUB = 0x40000000, + SUBS = SUB | AddSubSetFlagsBit +}; + +#define ADD_SUB_OP_LIST(V) \ + V(ADD), \ + V(ADDS), \ + V(SUB), \ + V(SUBS) + +enum AddSubImmediateOp { + AddSubImmediateFixed = 0x11000000, + AddSubImmediateFMask = 0x1F000000, + AddSubImmediateMask = 0xFF000000, + #define ADD_SUB_IMMEDIATE(A) \ + A##_w_imm = AddSubImmediateFixed | A, \ + A##_x_imm = AddSubImmediateFixed | A | SixtyFourBits + ADD_SUB_OP_LIST(ADD_SUB_IMMEDIATE) + #undef ADD_SUB_IMMEDIATE +}; + +enum AddSubShiftedOp { + AddSubShiftedFixed = 0x0B000000, + AddSubShiftedFMask = 0x1F200000, + AddSubShiftedMask = 0xFF200000, + #define ADD_SUB_SHIFTED(A) \ + A##_w_shift = AddSubShiftedFixed | A, \ + A##_x_shift = AddSubShiftedFixed | A | SixtyFourBits + ADD_SUB_OP_LIST(ADD_SUB_SHIFTED) + #undef ADD_SUB_SHIFTED +}; + +enum AddSubExtendedOp { + AddSubExtendedFixed = 0x0B200000, + AddSubExtendedFMask = 0x1F200000, + AddSubExtendedMask = 0xFFE00000, + #define ADD_SUB_EXTENDED(A) \ + A##_w_ext = AddSubExtendedFixed | A, \ + A##_x_ext = AddSubExtendedFixed | A | SixtyFourBits + ADD_SUB_OP_LIST(ADD_SUB_EXTENDED) + #undef ADD_SUB_EXTENDED +}; + +// Add/sub with carry. +enum AddSubWithCarryOp { + AddSubWithCarryFixed = 0x1A000000, + AddSubWithCarryFMask = 0x1FE00000, + AddSubWithCarryMask = 0xFFE0FC00, + ADC_w = AddSubWithCarryFixed | ADD, + ADC_x = AddSubWithCarryFixed | ADD | SixtyFourBits, + ADC = ADC_w, + ADCS_w = AddSubWithCarryFixed | ADDS, + ADCS_x = AddSubWithCarryFixed | ADDS | SixtyFourBits, + SBC_w = AddSubWithCarryFixed | SUB, + SBC_x = AddSubWithCarryFixed | SUB | SixtyFourBits, + SBC = SBC_w, + SBCS_w = AddSubWithCarryFixed | SUBS, + SBCS_x = AddSubWithCarryFixed | SUBS | SixtyFourBits +}; + + +// Logical (immediate and shifted register). +enum LogicalOp { + LogicalOpMask = 0x60200000, + NOT = 0x00200000, + AND = 0x00000000, + BIC = AND | NOT, + ORR = 0x20000000, + ORN = ORR | NOT, + EOR = 0x40000000, + EON = EOR | NOT, + ANDS = 0x60000000, + BICS = ANDS | NOT +}; + +// Logical immediate. +enum LogicalImmediateOp { + LogicalImmediateFixed = 0x12000000, + LogicalImmediateFMask = 0x1F800000, + LogicalImmediateMask = 0xFF800000, + AND_w_imm = LogicalImmediateFixed | AND, + AND_x_imm = LogicalImmediateFixed | AND | SixtyFourBits, + ORR_w_imm = LogicalImmediateFixed | ORR, + ORR_x_imm = LogicalImmediateFixed | ORR | SixtyFourBits, + EOR_w_imm = LogicalImmediateFixed | EOR, + EOR_x_imm = LogicalImmediateFixed | EOR | SixtyFourBits, + ANDS_w_imm = LogicalImmediateFixed | ANDS, + ANDS_x_imm = LogicalImmediateFixed | ANDS | SixtyFourBits +}; + +// Logical shifted register. +enum LogicalShiftedOp { + LogicalShiftedFixed = 0x0A000000, + LogicalShiftedFMask = 0x1F000000, + LogicalShiftedMask = 0xFF200000, + AND_w = LogicalShiftedFixed | AND, + AND_x = LogicalShiftedFixed | AND | SixtyFourBits, + AND_shift = AND_w, + BIC_w = LogicalShiftedFixed | BIC, + BIC_x = LogicalShiftedFixed | BIC | SixtyFourBits, + BIC_shift = BIC_w, + ORR_w = LogicalShiftedFixed | ORR, + ORR_x = LogicalShiftedFixed | ORR | SixtyFourBits, + ORR_shift = ORR_w, + ORN_w = LogicalShiftedFixed | ORN, + ORN_x = LogicalShiftedFixed | ORN | SixtyFourBits, + ORN_shift = ORN_w, + EOR_w = LogicalShiftedFixed | EOR, + EOR_x = LogicalShiftedFixed | EOR | SixtyFourBits, + EOR_shift = EOR_w, + EON_w = LogicalShiftedFixed | EON, + EON_x = LogicalShiftedFixed | EON | SixtyFourBits, + EON_shift = EON_w, + ANDS_w = LogicalShiftedFixed | ANDS, + ANDS_x = LogicalShiftedFixed | ANDS | SixtyFourBits, + ANDS_shift = ANDS_w, + BICS_w = LogicalShiftedFixed | BICS, + BICS_x = LogicalShiftedFixed | BICS | SixtyFourBits, + BICS_shift = BICS_w +}; + +// Move wide immediate. +enum MoveWideImmediateOp { + MoveWideImmediateFixed = 0x12800000, + MoveWideImmediateFMask = 0x1F800000, + MoveWideImmediateMask = 0xFF800000, + MOVN = 0x00000000, + MOVZ = 0x40000000, + MOVK = 0x60000000, + MOVN_w = MoveWideImmediateFixed | MOVN, + MOVN_x = MoveWideImmediateFixed | MOVN | SixtyFourBits, + MOVZ_w = MoveWideImmediateFixed | MOVZ, + MOVZ_x = MoveWideImmediateFixed | MOVZ | SixtyFourBits, + MOVK_w = MoveWideImmediateFixed | MOVK, + MOVK_x = MoveWideImmediateFixed | MOVK | SixtyFourBits +}; + +// Bitfield. +const int kBitfieldNOffset = 22; +enum BitfieldOp { + BitfieldFixed = 0x13000000, + BitfieldFMask = 0x1F800000, + BitfieldMask = 0xFF800000, + SBFM_w = BitfieldFixed | 0x00000000, + SBFM_x = BitfieldFixed | 0x80000000, + SBFM = SBFM_w, + BFM_w = BitfieldFixed | 0x20000000, + BFM_x = BitfieldFixed | 0xA0000000, + BFM = BFM_w, + UBFM_w = BitfieldFixed | 0x40000000, + UBFM_x = BitfieldFixed | 0xC0000000, + UBFM = UBFM_w + // Bitfield N field. +}; + +// Extract. +enum ExtractOp { + ExtractFixed = 0x13800000, + ExtractFMask = 0x1F800000, + ExtractMask = 0xFFA00000, + EXTR_w = ExtractFixed | 0x00000000, + EXTR_x = ExtractFixed | 0x80000000, + EXTR = EXTR_w +}; + +// Unconditional branch. +enum UnconditionalBranchOp { + UnconditionalBranchFixed = 0x14000000, + UnconditionalBranchFMask = 0x7C000000, + UnconditionalBranchMask = 0xFC000000, + B = UnconditionalBranchFixed | 0x00000000, + BL = UnconditionalBranchFixed | 0x80000000 +}; + +// Unconditional branch to register. +enum UnconditionalBranchToRegisterOp { + UnconditionalBranchToRegisterFixed = 0xD6000000, + UnconditionalBranchToRegisterFMask = 0xFE000000, + UnconditionalBranchToRegisterMask = 0xFFFFFC1F, + BR = UnconditionalBranchToRegisterFixed | 0x001F0000, + BLR = UnconditionalBranchToRegisterFixed | 0x003F0000, + RET = UnconditionalBranchToRegisterFixed | 0x005F0000 +}; + +// Compare and branch. +enum CompareBranchOp { + CompareBranchFixed = 0x34000000, + CompareBranchFMask = 0x7E000000, + CompareBranchMask = 0xFF000000, + CBZ_w = CompareBranchFixed | 0x00000000, + CBZ_x = CompareBranchFixed | 0x80000000, + CBZ = CBZ_w, + CBNZ_w = CompareBranchFixed | 0x01000000, + CBNZ_x = CompareBranchFixed | 0x81000000, + CBNZ = CBNZ_w +}; + +// Test and branch. +enum TestBranchOp { + TestBranchFixed = 0x36000000, + TestBranchFMask = 0x7E000000, + TestBranchMask = 0x7F000000, + TBZ = TestBranchFixed | 0x00000000, + TBNZ = TestBranchFixed | 0x01000000 +}; + +// Conditional branch. +enum ConditionalBranchOp { + ConditionalBranchFixed = 0x54000000, + ConditionalBranchFMask = 0xFE000000, + ConditionalBranchMask = 0xFF000010, + B_cond = ConditionalBranchFixed | 0x00000000 +}; + +// System. +// System instruction encoding is complicated because some instructions use op +// and CR fields to encode parameters. To handle this cleanly, the system +// instructions are split into more than one enum. + +enum SystemOp { + SystemFixed = 0xD5000000, + SystemFMask = 0xFFC00000 +}; + +enum SystemSysRegOp { + SystemSysRegFixed = 0xD5100000, + SystemSysRegFMask = 0xFFD00000, + SystemSysRegMask = 0xFFF00000, + MRS = SystemSysRegFixed | 0x00200000, + MSR = SystemSysRegFixed | 0x00000000 +}; + +enum SystemHintOp { + SystemHintFixed = 0xD503201F, + SystemHintFMask = 0xFFFFF01F, + SystemHintMask = 0xFFFFF01F, + HINT = SystemHintFixed | 0x00000000 +}; + +// Exception. +enum ExceptionOp { + ExceptionFixed = 0xD4000000, + ExceptionFMask = 0xFF000000, + ExceptionMask = 0xFFE0001F, + HLT = ExceptionFixed | 0x00400000, + BRK = ExceptionFixed | 0x00200000, + SVC = ExceptionFixed | 0x00000001, + HVC = ExceptionFixed | 0x00000002, + SMC = ExceptionFixed | 0x00000003, + DCPS1 = ExceptionFixed | 0x00A00001, + DCPS2 = ExceptionFixed | 0x00A00002, + DCPS3 = ExceptionFixed | 0x00A00003 +}; + +enum MemBarrierOp { + MemBarrierFixed = 0xD503309F, + MemBarrierFMask = 0xFFFFF09F, + MemBarrierMask = 0xFFFFF0FF, + DSB = MemBarrierFixed | 0x00000000, + DMB = MemBarrierFixed | 0x00000020, + ISB = MemBarrierFixed | 0x00000040 +}; + +// Any load or store. +enum LoadStoreAnyOp { + LoadStoreAnyFMask = 0x0a000000, + LoadStoreAnyFixed = 0x08000000 +}; + +#define LOAD_STORE_PAIR_OP_LIST(V) \ + V(STP, w, 0x00000000), \ + V(LDP, w, 0x00400000), \ + V(LDPSW, x, 0x40400000), \ + V(STP, x, 0x80000000), \ + V(LDP, x, 0x80400000), \ + V(STP, s, 0x04000000), \ + V(LDP, s, 0x04400000), \ + V(STP, d, 0x44000000), \ + V(LDP, d, 0x44400000) + +// Load/store pair (post, pre and offset.) +enum LoadStorePairOp { + LoadStorePairMask = 0xC4400000, + LoadStorePairLBit = 1 << 22, + #define LOAD_STORE_PAIR(A, B, C) \ + A##_##B = C + LOAD_STORE_PAIR_OP_LIST(LOAD_STORE_PAIR) + #undef LOAD_STORE_PAIR +}; + +enum LoadStorePairPostIndexOp { + LoadStorePairPostIndexFixed = 0x28800000, + LoadStorePairPostIndexFMask = 0x3B800000, + LoadStorePairPostIndexMask = 0xFFC00000, + #define LOAD_STORE_PAIR_POST_INDEX(A, B, C) \ + A##_##B##_post = LoadStorePairPostIndexFixed | A##_##B + LOAD_STORE_PAIR_OP_LIST(LOAD_STORE_PAIR_POST_INDEX) + #undef LOAD_STORE_PAIR_POST_INDEX +}; + +enum LoadStorePairPreIndexOp { + LoadStorePairPreIndexFixed = 0x29800000, + LoadStorePairPreIndexFMask = 0x3B800000, + LoadStorePairPreIndexMask = 0xFFC00000, + #define LOAD_STORE_PAIR_PRE_INDEX(A, B, C) \ + A##_##B##_pre = LoadStorePairPreIndexFixed | A##_##B + LOAD_STORE_PAIR_OP_LIST(LOAD_STORE_PAIR_PRE_INDEX) + #undef LOAD_STORE_PAIR_PRE_INDEX +}; + +enum LoadStorePairOffsetOp { + LoadStorePairOffsetFixed = 0x29000000, + LoadStorePairOffsetFMask = 0x3B800000, + LoadStorePairOffsetMask = 0xFFC00000, + #define LOAD_STORE_PAIR_OFFSET(A, B, C) \ + A##_##B##_off = LoadStorePairOffsetFixed | A##_##B + LOAD_STORE_PAIR_OP_LIST(LOAD_STORE_PAIR_OFFSET) + #undef LOAD_STORE_PAIR_OFFSET +}; + +enum LoadStorePairNonTemporalOp { + LoadStorePairNonTemporalFixed = 0x28000000, + LoadStorePairNonTemporalFMask = 0x3B800000, + LoadStorePairNonTemporalMask = 0xFFC00000, + STNP_w = LoadStorePairNonTemporalFixed | STP_w, + LDNP_w = LoadStorePairNonTemporalFixed | LDP_w, + STNP_x = LoadStorePairNonTemporalFixed | STP_x, + LDNP_x = LoadStorePairNonTemporalFixed | LDP_x, + STNP_s = LoadStorePairNonTemporalFixed | STP_s, + LDNP_s = LoadStorePairNonTemporalFixed | LDP_s, + STNP_d = LoadStorePairNonTemporalFixed | STP_d, + LDNP_d = LoadStorePairNonTemporalFixed | LDP_d +}; + +// Load literal. +enum LoadLiteralOp { + LoadLiteralFixed = 0x18000000, + LoadLiteralFMask = 0x3B000000, + LoadLiteralMask = 0xFF000000, + LDR_w_lit = LoadLiteralFixed | 0x00000000, + LDR_x_lit = LoadLiteralFixed | 0x40000000, + LDRSW_x_lit = LoadLiteralFixed | 0x80000000, + PRFM_lit = LoadLiteralFixed | 0xC0000000, + LDR_s_lit = LoadLiteralFixed | 0x04000000, + LDR_d_lit = LoadLiteralFixed | 0x44000000 +}; + +#define LOAD_STORE_OP_LIST(V) \ + V(ST, RB, w, 0x00000000), \ + V(ST, RH, w, 0x40000000), \ + V(ST, R, w, 0x80000000), \ + V(ST, R, x, 0xC0000000), \ + V(LD, RB, w, 0x00400000), \ + V(LD, RH, w, 0x40400000), \ + V(LD, R, w, 0x80400000), \ + V(LD, R, x, 0xC0400000), \ + V(LD, RSB, x, 0x00800000), \ + V(LD, RSH, x, 0x40800000), \ + V(LD, RSW, x, 0x80800000), \ + V(LD, RSB, w, 0x00C00000), \ + V(LD, RSH, w, 0x40C00000), \ + V(ST, R, s, 0x84000000), \ + V(ST, R, d, 0xC4000000), \ + V(LD, R, s, 0x84400000), \ + V(LD, R, d, 0xC4400000) + + +// Load/store unscaled offset. +enum LoadStoreUnscaledOffsetOp { + LoadStoreUnscaledOffsetFixed = 0x38000000, + LoadStoreUnscaledOffsetFMask = 0x3B200C00, + LoadStoreUnscaledOffsetMask = 0xFFE00C00, + #define LOAD_STORE_UNSCALED(A, B, C, D) \ + A##U##B##_##C = LoadStoreUnscaledOffsetFixed | D + LOAD_STORE_OP_LIST(LOAD_STORE_UNSCALED) + #undef LOAD_STORE_UNSCALED +}; + +// Load/store (post, pre, offset and unsigned.) +enum LoadStoreOp { + LoadStoreOpMask = 0xC4C00000, + #define LOAD_STORE(A, B, C, D) \ + A##B##_##C = D + LOAD_STORE_OP_LIST(LOAD_STORE), + #undef LOAD_STORE + PRFM = 0xC0800000 +}; + +// Load/store post index. +enum LoadStorePostIndex { + LoadStorePostIndexFixed = 0x38000400, + LoadStorePostIndexFMask = 0x3B200C00, + LoadStorePostIndexMask = 0xFFE00C00, + #define LOAD_STORE_POST_INDEX(A, B, C, D) \ + A##B##_##C##_post = LoadStorePostIndexFixed | D + LOAD_STORE_OP_LIST(LOAD_STORE_POST_INDEX) + #undef LOAD_STORE_POST_INDEX +}; + +// Load/store pre index. +enum LoadStorePreIndex { + LoadStorePreIndexFixed = 0x38000C00, + LoadStorePreIndexFMask = 0x3B200C00, + LoadStorePreIndexMask = 0xFFE00C00, + #define LOAD_STORE_PRE_INDEX(A, B, C, D) \ + A##B##_##C##_pre = LoadStorePreIndexFixed | D + LOAD_STORE_OP_LIST(LOAD_STORE_PRE_INDEX) + #undef LOAD_STORE_PRE_INDEX +}; + +// Load/store unsigned offset. +enum LoadStoreUnsignedOffset { + LoadStoreUnsignedOffsetFixed = 0x39000000, + LoadStoreUnsignedOffsetFMask = 0x3B000000, + LoadStoreUnsignedOffsetMask = 0xFFC00000, + PRFM_unsigned = LoadStoreUnsignedOffsetFixed | PRFM, + #define LOAD_STORE_UNSIGNED_OFFSET(A, B, C, D) \ + A##B##_##C##_unsigned = LoadStoreUnsignedOffsetFixed | D + LOAD_STORE_OP_LIST(LOAD_STORE_UNSIGNED_OFFSET) + #undef LOAD_STORE_UNSIGNED_OFFSET +}; + +// Load/store register offset. +enum LoadStoreRegisterOffset { + LoadStoreRegisterOffsetFixed = 0x38200800, + LoadStoreRegisterOffsetFMask = 0x3B200C00, + LoadStoreRegisterOffsetMask = 0xFFE00C00, + PRFM_reg = LoadStoreRegisterOffsetFixed | PRFM, + #define LOAD_STORE_REGISTER_OFFSET(A, B, C, D) \ + A##B##_##C##_reg = LoadStoreRegisterOffsetFixed | D + LOAD_STORE_OP_LIST(LOAD_STORE_REGISTER_OFFSET) + #undef LOAD_STORE_REGISTER_OFFSET +}; + +// Conditional compare. +enum ConditionalCompareOp { + ConditionalCompareMask = 0x60000000, + CCMN = 0x20000000, + CCMP = 0x60000000 +}; + +// Conditional compare register. +enum ConditionalCompareRegisterOp { + ConditionalCompareRegisterFixed = 0x1A400000, + ConditionalCompareRegisterFMask = 0x1FE00800, + ConditionalCompareRegisterMask = 0xFFE00C10, + CCMN_w = ConditionalCompareRegisterFixed | CCMN, + CCMN_x = ConditionalCompareRegisterFixed | SixtyFourBits | CCMN, + CCMP_w = ConditionalCompareRegisterFixed | CCMP, + CCMP_x = ConditionalCompareRegisterFixed | SixtyFourBits | CCMP +}; + +// Conditional compare immediate. +enum ConditionalCompareImmediateOp { + ConditionalCompareImmediateFixed = 0x1A400800, + ConditionalCompareImmediateFMask = 0x1FE00800, + ConditionalCompareImmediateMask = 0xFFE00C10, + CCMN_w_imm = ConditionalCompareImmediateFixed | CCMN, + CCMN_x_imm = ConditionalCompareImmediateFixed | SixtyFourBits | CCMN, + CCMP_w_imm = ConditionalCompareImmediateFixed | CCMP, + CCMP_x_imm = ConditionalCompareImmediateFixed | SixtyFourBits | CCMP +}; + +// Conditional select. +enum ConditionalSelectOp { + ConditionalSelectFixed = 0x1A800000, + ConditionalSelectFMask = 0x1FE00000, + ConditionalSelectMask = 0xFFE00C00, + CSEL_w = ConditionalSelectFixed | 0x00000000, + CSEL_x = ConditionalSelectFixed | 0x80000000, + CSEL = CSEL_w, + CSINC_w = ConditionalSelectFixed | 0x00000400, + CSINC_x = ConditionalSelectFixed | 0x80000400, + CSINC = CSINC_w, + CSINV_w = ConditionalSelectFixed | 0x40000000, + CSINV_x = ConditionalSelectFixed | 0xC0000000, + CSINV = CSINV_w, + CSNEG_w = ConditionalSelectFixed | 0x40000400, + CSNEG_x = ConditionalSelectFixed | 0xC0000400, + CSNEG = CSNEG_w +}; + +// Data processing 1 source. +enum DataProcessing1SourceOp { + DataProcessing1SourceFixed = 0x5AC00000, + DataProcessing1SourceFMask = 0x5FE00000, + DataProcessing1SourceMask = 0xFFFFFC00, + RBIT = DataProcessing1SourceFixed | 0x00000000, + RBIT_w = RBIT, + RBIT_x = RBIT | SixtyFourBits, + REV16 = DataProcessing1SourceFixed | 0x00000400, + REV16_w = REV16, + REV16_x = REV16 | SixtyFourBits, + REV = DataProcessing1SourceFixed | 0x00000800, + REV_w = REV, + REV32_x = REV | SixtyFourBits, + REV_x = DataProcessing1SourceFixed | SixtyFourBits | 0x00000C00, + CLZ = DataProcessing1SourceFixed | 0x00001000, + CLZ_w = CLZ, + CLZ_x = CLZ | SixtyFourBits, + CLS = DataProcessing1SourceFixed | 0x00001400, + CLS_w = CLS, + CLS_x = CLS | SixtyFourBits +}; + +// Data processing 2 source. +enum DataProcessing2SourceOp { + DataProcessing2SourceFixed = 0x1AC00000, + DataProcessing2SourceFMask = 0x5FE00000, + DataProcessing2SourceMask = 0xFFE0FC00, + UDIV_w = DataProcessing2SourceFixed | 0x00000800, + UDIV_x = DataProcessing2SourceFixed | 0x80000800, + UDIV = UDIV_w, + SDIV_w = DataProcessing2SourceFixed | 0x00000C00, + SDIV_x = DataProcessing2SourceFixed | 0x80000C00, + SDIV = SDIV_w, + LSLV_w = DataProcessing2SourceFixed | 0x00002000, + LSLV_x = DataProcessing2SourceFixed | 0x80002000, + LSLV = LSLV_w, + LSRV_w = DataProcessing2SourceFixed | 0x00002400, + LSRV_x = DataProcessing2SourceFixed | 0x80002400, + LSRV = LSRV_w, + ASRV_w = DataProcessing2SourceFixed | 0x00002800, + ASRV_x = DataProcessing2SourceFixed | 0x80002800, + ASRV = ASRV_w, + RORV_w = DataProcessing2SourceFixed | 0x00002C00, + RORV_x = DataProcessing2SourceFixed | 0x80002C00, + RORV = RORV_w, + CRC32B = DataProcessing2SourceFixed | 0x00004000, + CRC32H = DataProcessing2SourceFixed | 0x00004400, + CRC32W = DataProcessing2SourceFixed | 0x00004800, + CRC32X = DataProcessing2SourceFixed | SixtyFourBits | 0x00004C00, + CRC32CB = DataProcessing2SourceFixed | 0x00005000, + CRC32CH = DataProcessing2SourceFixed | 0x00005400, + CRC32CW = DataProcessing2SourceFixed | 0x00005800, + CRC32CX = DataProcessing2SourceFixed | SixtyFourBits | 0x00005C00 +}; + +// Data processing 3 source. +enum DataProcessing3SourceOp { + DataProcessing3SourceFixed = 0x1B000000, + DataProcessing3SourceFMask = 0x1F000000, + DataProcessing3SourceMask = 0xFFE08000, + MADD_w = DataProcessing3SourceFixed | 0x00000000, + MADD_x = DataProcessing3SourceFixed | 0x80000000, + MADD = MADD_w, + MSUB_w = DataProcessing3SourceFixed | 0x00008000, + MSUB_x = DataProcessing3SourceFixed | 0x80008000, + MSUB = MSUB_w, + SMADDL_x = DataProcessing3SourceFixed | 0x80200000, + SMSUBL_x = DataProcessing3SourceFixed | 0x80208000, + SMULH_x = DataProcessing3SourceFixed | 0x80400000, + UMADDL_x = DataProcessing3SourceFixed | 0x80A00000, + UMSUBL_x = DataProcessing3SourceFixed | 0x80A08000, + UMULH_x = DataProcessing3SourceFixed | 0x80C00000 +}; + +// Floating point compare. +enum FPCompareOp { + FPCompareFixed = 0x1E202000, + FPCompareFMask = 0x5F203C00, + FPCompareMask = 0xFFE0FC1F, + FCMP_s = FPCompareFixed | 0x00000000, + FCMP_d = FPCompareFixed | FP64 | 0x00000000, + FCMP = FCMP_s, + FCMP_s_zero = FPCompareFixed | 0x00000008, + FCMP_d_zero = FPCompareFixed | FP64 | 0x00000008, + FCMP_zero = FCMP_s_zero, + FCMPE_s = FPCompareFixed | 0x00000010, + FCMPE_d = FPCompareFixed | FP64 | 0x00000010, + FCMPE_s_zero = FPCompareFixed | 0x00000018, + FCMPE_d_zero = FPCompareFixed | FP64 | 0x00000018 +}; + +// Floating point conditional compare. +enum FPConditionalCompareOp { + FPConditionalCompareFixed = 0x1E200400, + FPConditionalCompareFMask = 0x5F200C00, + FPConditionalCompareMask = 0xFFE00C10, + FCCMP_s = FPConditionalCompareFixed | 0x00000000, + FCCMP_d = FPConditionalCompareFixed | FP64 | 0x00000000, + FCCMP = FCCMP_s, + FCCMPE_s = FPConditionalCompareFixed | 0x00000010, + FCCMPE_d = FPConditionalCompareFixed | FP64 | 0x00000010, + FCCMPE = FCCMPE_s +}; + +// Floating point conditional select. +enum FPConditionalSelectOp { + FPConditionalSelectFixed = 0x1E200C00, + FPConditionalSelectFMask = 0x5F200C00, + FPConditionalSelectMask = 0xFFE00C00, + FCSEL_s = FPConditionalSelectFixed | 0x00000000, + FCSEL_d = FPConditionalSelectFixed | FP64 | 0x00000000, + FCSEL = FCSEL_s +}; + +// Floating point immediate. +enum FPImmediateOp { + FPImmediateFixed = 0x1E201000, + FPImmediateFMask = 0x5F201C00, + FPImmediateMask = 0xFFE01C00, + FMOV_s_imm = FPImmediateFixed | 0x00000000, + FMOV_d_imm = FPImmediateFixed | FP64 | 0x00000000 +}; + +// Floating point data processing 1 source. +enum FPDataProcessing1SourceOp { + FPDataProcessing1SourceFixed = 0x1E204000, + FPDataProcessing1SourceFMask = 0x5F207C00, + FPDataProcessing1SourceMask = 0xFFFFFC00, + FMOV_s = FPDataProcessing1SourceFixed | 0x00000000, + FMOV_d = FPDataProcessing1SourceFixed | FP64 | 0x00000000, + FMOV = FMOV_s, + FABS_s = FPDataProcessing1SourceFixed | 0x00008000, + FABS_d = FPDataProcessing1SourceFixed | FP64 | 0x00008000, + FABS = FABS_s, + FNEG_s = FPDataProcessing1SourceFixed | 0x00010000, + FNEG_d = FPDataProcessing1SourceFixed | FP64 | 0x00010000, + FNEG = FNEG_s, + FSQRT_s = FPDataProcessing1SourceFixed | 0x00018000, + FSQRT_d = FPDataProcessing1SourceFixed | FP64 | 0x00018000, + FSQRT = FSQRT_s, + FCVT_ds = FPDataProcessing1SourceFixed | 0x00028000, + FCVT_sd = FPDataProcessing1SourceFixed | FP64 | 0x00020000, + FRINTN_s = FPDataProcessing1SourceFixed | 0x00040000, + FRINTN_d = FPDataProcessing1SourceFixed | FP64 | 0x00040000, + FRINTN = FRINTN_s, + FRINTP_s = FPDataProcessing1SourceFixed | 0x00048000, + FRINTP_d = FPDataProcessing1SourceFixed | FP64 | 0x00048000, + FRINTP = FRINTP_s, + FRINTM_s = FPDataProcessing1SourceFixed | 0x00050000, + FRINTM_d = FPDataProcessing1SourceFixed | FP64 | 0x00050000, + FRINTM = FRINTM_s, + FRINTZ_s = FPDataProcessing1SourceFixed | 0x00058000, + FRINTZ_d = FPDataProcessing1SourceFixed | FP64 | 0x00058000, + FRINTZ = FRINTZ_s, + FRINTA_s = FPDataProcessing1SourceFixed | 0x00060000, + FRINTA_d = FPDataProcessing1SourceFixed | FP64 | 0x00060000, + FRINTA = FRINTA_s, + FRINTX_s = FPDataProcessing1SourceFixed | 0x00070000, + FRINTX_d = FPDataProcessing1SourceFixed | FP64 | 0x00070000, + FRINTX = FRINTX_s, + FRINTI_s = FPDataProcessing1SourceFixed | 0x00078000, + FRINTI_d = FPDataProcessing1SourceFixed | FP64 | 0x00078000, + FRINTI = FRINTI_s +}; + +// Floating point data processing 2 source. +enum FPDataProcessing2SourceOp { + FPDataProcessing2SourceFixed = 0x1E200800, + FPDataProcessing2SourceFMask = 0x5F200C00, + FPDataProcessing2SourceMask = 0xFFE0FC00, + FMUL = FPDataProcessing2SourceFixed | 0x00000000, + FMUL_s = FMUL, + FMUL_d = FMUL | FP64, + FDIV = FPDataProcessing2SourceFixed | 0x00001000, + FDIV_s = FDIV, + FDIV_d = FDIV | FP64, + FADD = FPDataProcessing2SourceFixed | 0x00002000, + FADD_s = FADD, + FADD_d = FADD | FP64, + FSUB = FPDataProcessing2SourceFixed | 0x00003000, + FSUB_s = FSUB, + FSUB_d = FSUB | FP64, + FMAX = FPDataProcessing2SourceFixed | 0x00004000, + FMAX_s = FMAX, + FMAX_d = FMAX | FP64, + FMIN = FPDataProcessing2SourceFixed | 0x00005000, + FMIN_s = FMIN, + FMIN_d = FMIN | FP64, + FMAXNM = FPDataProcessing2SourceFixed | 0x00006000, + FMAXNM_s = FMAXNM, + FMAXNM_d = FMAXNM | FP64, + FMINNM = FPDataProcessing2SourceFixed | 0x00007000, + FMINNM_s = FMINNM, + FMINNM_d = FMINNM | FP64, + FNMUL = FPDataProcessing2SourceFixed | 0x00008000, + FNMUL_s = FNMUL, + FNMUL_d = FNMUL | FP64 +}; + +// Floating point data processing 3 source. +enum FPDataProcessing3SourceOp { + FPDataProcessing3SourceFixed = 0x1F000000, + FPDataProcessing3SourceFMask = 0x5F000000, + FPDataProcessing3SourceMask = 0xFFE08000, + FMADD_s = FPDataProcessing3SourceFixed | 0x00000000, + FMSUB_s = FPDataProcessing3SourceFixed | 0x00008000, + FNMADD_s = FPDataProcessing3SourceFixed | 0x00200000, + FNMSUB_s = FPDataProcessing3SourceFixed | 0x00208000, + FMADD_d = FPDataProcessing3SourceFixed | 0x00400000, + FMSUB_d = FPDataProcessing3SourceFixed | 0x00408000, + FNMADD_d = FPDataProcessing3SourceFixed | 0x00600000, + FNMSUB_d = FPDataProcessing3SourceFixed | 0x00608000 +}; + +// Conversion between floating point and integer. +enum FPIntegerConvertOp { + FPIntegerConvertFixed = 0x1E200000, + FPIntegerConvertFMask = 0x5F20FC00, + FPIntegerConvertMask = 0xFFFFFC00, + FCVTNS = FPIntegerConvertFixed | 0x00000000, + FCVTNS_ws = FCVTNS, + FCVTNS_xs = FCVTNS | SixtyFourBits, + FCVTNS_wd = FCVTNS | FP64, + FCVTNS_xd = FCVTNS | SixtyFourBits | FP64, + FCVTNU = FPIntegerConvertFixed | 0x00010000, + FCVTNU_ws = FCVTNU, + FCVTNU_xs = FCVTNU | SixtyFourBits, + FCVTNU_wd = FCVTNU | FP64, + FCVTNU_xd = FCVTNU | SixtyFourBits | FP64, + FCVTPS = FPIntegerConvertFixed | 0x00080000, + FCVTPS_ws = FCVTPS, + FCVTPS_xs = FCVTPS | SixtyFourBits, + FCVTPS_wd = FCVTPS | FP64, + FCVTPS_xd = FCVTPS | SixtyFourBits | FP64, + FCVTPU = FPIntegerConvertFixed | 0x00090000, + FCVTPU_ws = FCVTPU, + FCVTPU_xs = FCVTPU | SixtyFourBits, + FCVTPU_wd = FCVTPU | FP64, + FCVTPU_xd = FCVTPU | SixtyFourBits | FP64, + FCVTMS = FPIntegerConvertFixed | 0x00100000, + FCVTMS_ws = FCVTMS, + FCVTMS_xs = FCVTMS | SixtyFourBits, + FCVTMS_wd = FCVTMS | FP64, + FCVTMS_xd = FCVTMS | SixtyFourBits | FP64, + FCVTMU = FPIntegerConvertFixed | 0x00110000, + FCVTMU_ws = FCVTMU, + FCVTMU_xs = FCVTMU | SixtyFourBits, + FCVTMU_wd = FCVTMU | FP64, + FCVTMU_xd = FCVTMU | SixtyFourBits | FP64, + FCVTZS = FPIntegerConvertFixed | 0x00180000, + FCVTZS_ws = FCVTZS, + FCVTZS_xs = FCVTZS | SixtyFourBits, + FCVTZS_wd = FCVTZS | FP64, + FCVTZS_xd = FCVTZS | SixtyFourBits | FP64, + FCVTZU = FPIntegerConvertFixed | 0x00190000, + FCVTZU_ws = FCVTZU, + FCVTZU_xs = FCVTZU | SixtyFourBits, + FCVTZU_wd = FCVTZU | FP64, + FCVTZU_xd = FCVTZU | SixtyFourBits | FP64, + SCVTF = FPIntegerConvertFixed | 0x00020000, + SCVTF_sw = SCVTF, + SCVTF_sx = SCVTF | SixtyFourBits, + SCVTF_dw = SCVTF | FP64, + SCVTF_dx = SCVTF | SixtyFourBits | FP64, + UCVTF = FPIntegerConvertFixed | 0x00030000, + UCVTF_sw = UCVTF, + UCVTF_sx = UCVTF | SixtyFourBits, + UCVTF_dw = UCVTF | FP64, + UCVTF_dx = UCVTF | SixtyFourBits | FP64, + FCVTAS = FPIntegerConvertFixed | 0x00040000, + FCVTAS_ws = FCVTAS, + FCVTAS_xs = FCVTAS | SixtyFourBits, + FCVTAS_wd = FCVTAS | FP64, + FCVTAS_xd = FCVTAS | SixtyFourBits | FP64, + FCVTAU = FPIntegerConvertFixed | 0x00050000, + FCVTAU_ws = FCVTAU, + FCVTAU_xs = FCVTAU | SixtyFourBits, + FCVTAU_wd = FCVTAU | FP64, + FCVTAU_xd = FCVTAU | SixtyFourBits | FP64, + FMOV_ws = FPIntegerConvertFixed | 0x00060000, + FMOV_sw = FPIntegerConvertFixed | 0x00070000, + FMOV_xd = FMOV_ws | SixtyFourBits | FP64, + FMOV_dx = FMOV_sw | SixtyFourBits | FP64 +}; + +// Conversion between fixed point and floating point. +enum FPFixedPointConvertOp { + FPFixedPointConvertFixed = 0x1E000000, + FPFixedPointConvertFMask = 0x5F200000, + FPFixedPointConvertMask = 0xFFFF0000, + FCVTZS_fixed = FPFixedPointConvertFixed | 0x00180000, + FCVTZS_ws_fixed = FCVTZS_fixed, + FCVTZS_xs_fixed = FCVTZS_fixed | SixtyFourBits, + FCVTZS_wd_fixed = FCVTZS_fixed | FP64, + FCVTZS_xd_fixed = FCVTZS_fixed | SixtyFourBits | FP64, + FCVTZU_fixed = FPFixedPointConvertFixed | 0x00190000, + FCVTZU_ws_fixed = FCVTZU_fixed, + FCVTZU_xs_fixed = FCVTZU_fixed | SixtyFourBits, + FCVTZU_wd_fixed = FCVTZU_fixed | FP64, + FCVTZU_xd_fixed = FCVTZU_fixed | SixtyFourBits | FP64, + SCVTF_fixed = FPFixedPointConvertFixed | 0x00020000, + SCVTF_sw_fixed = SCVTF_fixed, + SCVTF_sx_fixed = SCVTF_fixed | SixtyFourBits, + SCVTF_dw_fixed = SCVTF_fixed | FP64, + SCVTF_dx_fixed = SCVTF_fixed | SixtyFourBits | FP64, + UCVTF_fixed = FPFixedPointConvertFixed | 0x00030000, + UCVTF_sw_fixed = UCVTF_fixed, + UCVTF_sx_fixed = UCVTF_fixed | SixtyFourBits, + UCVTF_dw_fixed = UCVTF_fixed | FP64, + UCVTF_dx_fixed = UCVTF_fixed | SixtyFourBits | FP64 +}; + +// Unimplemented and unallocated instructions. These are defined to make fixed +// bit assertion easier. +enum UnimplementedOp { + UnimplementedFixed = 0x00000000, + UnimplementedFMask = 0x00000000 +}; + +enum UnallocatedOp { + UnallocatedFixed = 0x00000000, + UnallocatedFMask = 0x00000000 +}; + +} // namespace vixl + +#endif // VIXL_A64_CONSTANTS_A64_H_ diff --git a/disas/libvixl/a64/cpu-a64.h b/disas/libvixl/a64/cpu-a64.h new file mode 100644 index 0000000000..dfd8f015cf --- /dev/null +++ b/disas/libvixl/a64/cpu-a64.h @@ -0,0 +1,56 @@ +// Copyright 2013, ARM Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of ARM Limited nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef VIXL_CPU_A64_H +#define VIXL_CPU_A64_H + +#include "globals.h" + +namespace vixl { + +class CPU { + public: + // Initialise CPU support. + static void SetUp(); + + // Ensures the data at a given address and with a given size is the same for + // the I and D caches. I and D caches are not automatically coherent on ARM + // so this operation is required before any dynamically generated code can + // safely run. + static void EnsureIAndDCacheCoherency(void *address, size_t length); + + private: + // Return the content of the cache type register. + static uint32_t GetCacheType(); + + // I and D cache line size in bytes. + static unsigned icache_line_size_; + static unsigned dcache_line_size_; +}; + +} // namespace vixl + +#endif // VIXL_CPU_A64_H diff --git a/disas/libvixl/a64/decoder-a64.cc b/disas/libvixl/a64/decoder-a64.cc new file mode 100644 index 0000000000..8450eb3b49 --- /dev/null +++ b/disas/libvixl/a64/decoder-a64.cc @@ -0,0 +1,712 @@ +// Copyright 2013, ARM Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of ARM Limited nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "globals.h" +#include "utils.h" +#include "a64/decoder-a64.h" + +namespace vixl { +// Top-level instruction decode function. +void Decoder::Decode(Instruction *instr) { + if (instr->Bits(28, 27) == 0) { + VisitUnallocated(instr); + } else { + switch (instr->Bits(27, 24)) { + // 0: PC relative addressing. + case 0x0: DecodePCRelAddressing(instr); break; + + // 1: Add/sub immediate. + case 0x1: DecodeAddSubImmediate(instr); break; + + // A: Logical shifted register. + // Add/sub with carry. + // Conditional compare register. + // Conditional compare immediate. + // Conditional select. + // Data processing 1 source. + // Data processing 2 source. + // B: Add/sub shifted register. + // Add/sub extended register. + // Data processing 3 source. + case 0xA: + case 0xB: DecodeDataProcessing(instr); break; + + // 2: Logical immediate. + // Move wide immediate. + case 0x2: DecodeLogical(instr); break; + + // 3: Bitfield. + // Extract. + case 0x3: DecodeBitfieldExtract(instr); break; + + // 4: Unconditional branch immediate. + // Exception generation. + // Compare and branch immediate. + // 5: Compare and branch immediate. + // Conditional branch. + // System. + // 6,7: Unconditional branch. + // Test and branch immediate. + case 0x4: + case 0x5: + case 0x6: + case 0x7: DecodeBranchSystemException(instr); break; + + // 8,9: Load/store register pair post-index. + // Load register literal. + // Load/store register unscaled immediate. + // Load/store register immediate post-index. + // Load/store register immediate pre-index. + // Load/store register offset. + // Load/store exclusive. + // C,D: Load/store register pair offset. + // Load/store register pair pre-index. + // Load/store register unsigned immediate. + // Advanced SIMD. + case 0x8: + case 0x9: + case 0xC: + case 0xD: DecodeLoadStore(instr); break; + + // E: FP fixed point conversion. + // FP integer conversion. + // FP data processing 1 source. + // FP compare. + // FP immediate. + // FP data processing 2 source. + // FP conditional compare. + // FP conditional select. + // Advanced SIMD. + // F: FP data processing 3 source. + // Advanced SIMD. + case 0xE: + case 0xF: DecodeFP(instr); break; + } + } +} + +void Decoder::AppendVisitor(DecoderVisitor* new_visitor) { + visitors_.remove(new_visitor); + visitors_.push_front(new_visitor); +} + + +void Decoder::PrependVisitor(DecoderVisitor* new_visitor) { + visitors_.remove(new_visitor); + visitors_.push_back(new_visitor); +} + + +void Decoder::InsertVisitorBefore(DecoderVisitor* new_visitor, + DecoderVisitor* registered_visitor) { + visitors_.remove(new_visitor); + std::list::iterator it; + for (it = visitors_.begin(); it != visitors_.end(); it++) { + if (*it == registered_visitor) { + visitors_.insert(it, new_visitor); + return; + } + } + // We reached the end of the list. The last element must be + // registered_visitor. + VIXL_ASSERT(*it == registered_visitor); + visitors_.insert(it, new_visitor); +} + + +void Decoder::InsertVisitorAfter(DecoderVisitor* new_visitor, + DecoderVisitor* registered_visitor) { + visitors_.remove(new_visitor); + std::list::iterator it; + for (it = visitors_.begin(); it != visitors_.end(); it++) { + if (*it == registered_visitor) { + it++; + visitors_.insert(it, new_visitor); + return; + } + } + // We reached the end of the list. The last element must be + // registered_visitor. + VIXL_ASSERT(*it == registered_visitor); + visitors_.push_back(new_visitor); +} + + +void Decoder::RemoveVisitor(DecoderVisitor* visitor) { + visitors_.remove(visitor); +} + + +void Decoder::DecodePCRelAddressing(Instruction* instr) { + VIXL_ASSERT(instr->Bits(27, 24) == 0x0); + // We know bit 28 is set, as = 0 is filtered out at the top level + // decode. + VIXL_ASSERT(instr->Bit(28) == 0x1); + VisitPCRelAddressing(instr); +} + + +void Decoder::DecodeBranchSystemException(Instruction* instr) { + VIXL_ASSERT((instr->Bits(27, 24) == 0x4) || + (instr->Bits(27, 24) == 0x5) || + (instr->Bits(27, 24) == 0x6) || + (instr->Bits(27, 24) == 0x7) ); + + switch (instr->Bits(31, 29)) { + case 0: + case 4: { + VisitUnconditionalBranch(instr); + break; + } + case 1: + case 5: { + if (instr->Bit(25) == 0) { + VisitCompareBranch(instr); + } else { + VisitTestBranch(instr); + } + break; + } + case 2: { + if (instr->Bit(25) == 0) { + if ((instr->Bit(24) == 0x1) || + (instr->Mask(0x01000010) == 0x00000010)) { + VisitUnallocated(instr); + } else { + VisitConditionalBranch(instr); + } + } else { + VisitUnallocated(instr); + } + break; + } + case 6: { + if (instr->Bit(25) == 0) { + if (instr->Bit(24) == 0) { + if ((instr->Bits(4, 2) != 0) || + (instr->Mask(0x00E0001D) == 0x00200001) || + (instr->Mask(0x00E0001D) == 0x00400001) || + (instr->Mask(0x00E0001E) == 0x00200002) || + (instr->Mask(0x00E0001E) == 0x00400002) || + (instr->Mask(0x00E0001C) == 0x00600000) || + (instr->Mask(0x00E0001C) == 0x00800000) || + (instr->Mask(0x00E0001F) == 0x00A00000) || + (instr->Mask(0x00C0001C) == 0x00C00000)) { + VisitUnallocated(instr); + } else { + VisitException(instr); + } + } else { + if (instr->Bits(23, 22) == 0) { + const Instr masked_003FF0E0 = instr->Mask(0x003FF0E0); + if ((instr->Bits(21, 19) == 0x4) || + (masked_003FF0E0 == 0x00033000) || + (masked_003FF0E0 == 0x003FF020) || + (masked_003FF0E0 == 0x003FF060) || + (masked_003FF0E0 == 0x003FF0E0) || + (instr->Mask(0x00388000) == 0x00008000) || + (instr->Mask(0x0038E000) == 0x00000000) || + (instr->Mask(0x0039E000) == 0x00002000) || + (instr->Mask(0x003AE000) == 0x00002000) || + (instr->Mask(0x003CE000) == 0x00042000) || + (instr->Mask(0x003FFFC0) == 0x000320C0) || + (instr->Mask(0x003FF100) == 0x00032100) || + (instr->Mask(0x003FF200) == 0x00032200) || + (instr->Mask(0x003FF400) == 0x00032400) || + (instr->Mask(0x003FF800) == 0x00032800) || + (instr->Mask(0x0038F000) == 0x00005000) || + (instr->Mask(0x0038E000) == 0x00006000)) { + VisitUnallocated(instr); + } else { + VisitSystem(instr); + } + } else { + VisitUnallocated(instr); + } + } + } else { + if ((instr->Bit(24) == 0x1) || + (instr->Bits(20, 16) != 0x1F) || + (instr->Bits(15, 10) != 0) || + (instr->Bits(4, 0) != 0) || + (instr->Bits(24, 21) == 0x3) || + (instr->Bits(24, 22) == 0x3)) { + VisitUnallocated(instr); + } else { + VisitUnconditionalBranchToRegister(instr); + } + } + break; + } + case 3: + case 7: { + VisitUnallocated(instr); + break; + } + } +} + + +void Decoder::DecodeLoadStore(Instruction* instr) { + VIXL_ASSERT((instr->Bits(27, 24) == 0x8) || + (instr->Bits(27, 24) == 0x9) || + (instr->Bits(27, 24) == 0xC) || + (instr->Bits(27, 24) == 0xD) ); + + if (instr->Bit(24) == 0) { + if (instr->Bit(28) == 0) { + if (instr->Bit(29) == 0) { + if (instr->Bit(26) == 0) { + // TODO: VisitLoadStoreExclusive. + VisitUnimplemented(instr); + } else { + DecodeAdvSIMDLoadStore(instr); + } + } else { + if ((instr->Bits(31, 30) == 0x3) || + (instr->Mask(0xC4400000) == 0x40000000)) { + VisitUnallocated(instr); + } else { + if (instr->Bit(23) == 0) { + if (instr->Mask(0xC4400000) == 0xC0400000) { + VisitUnallocated(instr); + } else { + VisitLoadStorePairNonTemporal(instr); + } + } else { + VisitLoadStorePairPostIndex(instr); + } + } + } + } else { + if (instr->Bit(29) == 0) { + if (instr->Mask(0xC4000000) == 0xC4000000) { + VisitUnallocated(instr); + } else { + VisitLoadLiteral(instr); + } + } else { + if ((instr->Mask(0x84C00000) == 0x80C00000) || + (instr->Mask(0x44800000) == 0x44800000) || + (instr->Mask(0x84800000) == 0x84800000)) { + VisitUnallocated(instr); + } else { + if (instr->Bit(21) == 0) { + switch (instr->Bits(11, 10)) { + case 0: { + VisitLoadStoreUnscaledOffset(instr); + break; + } + case 1: { + if (instr->Mask(0xC4C00000) == 0xC0800000) { + VisitUnallocated(instr); + } else { + VisitLoadStorePostIndex(instr); + } + break; + } + case 2: { + // TODO: VisitLoadStoreRegisterOffsetUnpriv. + VisitUnimplemented(instr); + break; + } + case 3: { + if (instr->Mask(0xC4C00000) == 0xC0800000) { + VisitUnallocated(instr); + } else { + VisitLoadStorePreIndex(instr); + } + break; + } + } + } else { + if (instr->Bits(11, 10) == 0x2) { + if (instr->Bit(14) == 0) { + VisitUnallocated(instr); + } else { + VisitLoadStoreRegisterOffset(instr); + } + } else { + VisitUnallocated(instr); + } + } + } + } + } + } else { + if (instr->Bit(28) == 0) { + if (instr->Bit(29) == 0) { + VisitUnallocated(instr); + } else { + if ((instr->Bits(31, 30) == 0x3) || + (instr->Mask(0xC4400000) == 0x40000000)) { + VisitUnallocated(instr); + } else { + if (instr->Bit(23) == 0) { + VisitLoadStorePairOffset(instr); + } else { + VisitLoadStorePairPreIndex(instr); + } + } + } + } else { + if (instr->Bit(29) == 0) { + VisitUnallocated(instr); + } else { + if ((instr->Mask(0x84C00000) == 0x80C00000) || + (instr->Mask(0x44800000) == 0x44800000) || + (instr->Mask(0x84800000) == 0x84800000)) { + VisitUnallocated(instr); + } else { + VisitLoadStoreUnsignedOffset(instr); + } + } + } + } +} + + +void Decoder::DecodeLogical(Instruction* instr) { + VIXL_ASSERT(instr->Bits(27, 24) == 0x2); + + if (instr->Mask(0x80400000) == 0x00400000) { + VisitUnallocated(instr); + } else { + if (instr->Bit(23) == 0) { + VisitLogicalImmediate(instr); + } else { + if (instr->Bits(30, 29) == 0x1) { + VisitUnallocated(instr); + } else { + VisitMoveWideImmediate(instr); + } + } + } +} + + +void Decoder::DecodeBitfieldExtract(Instruction* instr) { + VIXL_ASSERT(instr->Bits(27, 24) == 0x3); + + if ((instr->Mask(0x80400000) == 0x80000000) || + (instr->Mask(0x80400000) == 0x00400000) || + (instr->Mask(0x80008000) == 0x00008000)) { + VisitUnallocated(instr); + } else if (instr->Bit(23) == 0) { + if ((instr->Mask(0x80200000) == 0x00200000) || + (instr->Mask(0x60000000) == 0x60000000)) { + VisitUnallocated(instr); + } else { + VisitBitfield(instr); + } + } else { + if ((instr->Mask(0x60200000) == 0x00200000) || + (instr->Mask(0x60000000) != 0x00000000)) { + VisitUnallocated(instr); + } else { + VisitExtract(instr); + } + } +} + + +void Decoder::DecodeAddSubImmediate(Instruction* instr) { + VIXL_ASSERT(instr->Bits(27, 24) == 0x1); + if (instr->Bit(23) == 1) { + VisitUnallocated(instr); + } else { + VisitAddSubImmediate(instr); + } +} + + +void Decoder::DecodeDataProcessing(Instruction* instr) { + VIXL_ASSERT((instr->Bits(27, 24) == 0xA) || + (instr->Bits(27, 24) == 0xB)); + + if (instr->Bit(24) == 0) { + if (instr->Bit(28) == 0) { + if (instr->Mask(0x80008000) == 0x00008000) { + VisitUnallocated(instr); + } else { + VisitLogicalShifted(instr); + } + } else { + switch (instr->Bits(23, 21)) { + case 0: { + if (instr->Mask(0x0000FC00) != 0) { + VisitUnallocated(instr); + } else { + VisitAddSubWithCarry(instr); + } + break; + } + case 2: { + if ((instr->Bit(29) == 0) || + (instr->Mask(0x00000410) != 0)) { + VisitUnallocated(instr); + } else { + if (instr->Bit(11) == 0) { + VisitConditionalCompareRegister(instr); + } else { + VisitConditionalCompareImmediate(instr); + } + } + break; + } + case 4: { + if (instr->Mask(0x20000800) != 0x00000000) { + VisitUnallocated(instr); + } else { + VisitConditionalSelect(instr); + } + break; + } + case 6: { + if (instr->Bit(29) == 0x1) { + VisitUnallocated(instr); + } else { + if (instr->Bit(30) == 0) { + if ((instr->Bit(15) == 0x1) || + (instr->Bits(15, 11) == 0) || + (instr->Bits(15, 12) == 0x1) || + (instr->Bits(15, 12) == 0x3) || + (instr->Bits(15, 13) == 0x3) || + (instr->Mask(0x8000EC00) == 0x00004C00) || + (instr->Mask(0x8000E800) == 0x80004000) || + (instr->Mask(0x8000E400) == 0x80004000)) { + VisitUnallocated(instr); + } else { + VisitDataProcessing2Source(instr); + } + } else { + if ((instr->Bit(13) == 1) || + (instr->Bits(20, 16) != 0) || + (instr->Bits(15, 14) != 0) || + (instr->Mask(0xA01FFC00) == 0x00000C00) || + (instr->Mask(0x201FF800) == 0x00001800)) { + VisitUnallocated(instr); + } else { + VisitDataProcessing1Source(instr); + } + } + break; + } + } + case 1: + case 3: + case 5: + case 7: VisitUnallocated(instr); break; + } + } + } else { + if (instr->Bit(28) == 0) { + if (instr->Bit(21) == 0) { + if ((instr->Bits(23, 22) == 0x3) || + (instr->Mask(0x80008000) == 0x00008000)) { + VisitUnallocated(instr); + } else { + VisitAddSubShifted(instr); + } + } else { + if ((instr->Mask(0x00C00000) != 0x00000000) || + (instr->Mask(0x00001400) == 0x00001400) || + (instr->Mask(0x00001800) == 0x00001800)) { + VisitUnallocated(instr); + } else { + VisitAddSubExtended(instr); + } + } + } else { + if ((instr->Bit(30) == 0x1) || + (instr->Bits(30, 29) == 0x1) || + (instr->Mask(0xE0600000) == 0x00200000) || + (instr->Mask(0xE0608000) == 0x00400000) || + (instr->Mask(0x60608000) == 0x00408000) || + (instr->Mask(0x60E00000) == 0x00E00000) || + (instr->Mask(0x60E00000) == 0x00800000) || + (instr->Mask(0x60E00000) == 0x00600000)) { + VisitUnallocated(instr); + } else { + VisitDataProcessing3Source(instr); + } + } + } +} + + +void Decoder::DecodeFP(Instruction* instr) { + VIXL_ASSERT((instr->Bits(27, 24) == 0xE) || + (instr->Bits(27, 24) == 0xF)); + + if (instr->Bit(28) == 0) { + DecodeAdvSIMDDataProcessing(instr); + } else { + if (instr->Bit(29) == 1) { + VisitUnallocated(instr); + } else { + if (instr->Bits(31, 30) == 0x3) { + VisitUnallocated(instr); + } else if (instr->Bits(31, 30) == 0x1) { + DecodeAdvSIMDDataProcessing(instr); + } else { + if (instr->Bit(24) == 0) { + if (instr->Bit(21) == 0) { + if ((instr->Bit(23) == 1) || + (instr->Bit(18) == 1) || + (instr->Mask(0x80008000) == 0x00000000) || + (instr->Mask(0x000E0000) == 0x00000000) || + (instr->Mask(0x000E0000) == 0x000A0000) || + (instr->Mask(0x00160000) == 0x00000000) || + (instr->Mask(0x00160000) == 0x00120000)) { + VisitUnallocated(instr); + } else { + VisitFPFixedPointConvert(instr); + } + } else { + if (instr->Bits(15, 10) == 32) { + VisitUnallocated(instr); + } else if (instr->Bits(15, 10) == 0) { + if ((instr->Bits(23, 22) == 0x3) || + (instr->Mask(0x000E0000) == 0x000A0000) || + (instr->Mask(0x000E0000) == 0x000C0000) || + (instr->Mask(0x00160000) == 0x00120000) || + (instr->Mask(0x00160000) == 0x00140000) || + (instr->Mask(0x20C40000) == 0x00800000) || + (instr->Mask(0x20C60000) == 0x00840000) || + (instr->Mask(0xA0C60000) == 0x80060000) || + (instr->Mask(0xA0C60000) == 0x00860000) || + (instr->Mask(0xA0C60000) == 0x00460000) || + (instr->Mask(0xA0CE0000) == 0x80860000) || + (instr->Mask(0xA0CE0000) == 0x804E0000) || + (instr->Mask(0xA0CE0000) == 0x000E0000) || + (instr->Mask(0xA0D60000) == 0x00160000) || + (instr->Mask(0xA0D60000) == 0x80560000) || + (instr->Mask(0xA0D60000) == 0x80960000)) { + VisitUnallocated(instr); + } else { + VisitFPIntegerConvert(instr); + } + } else if (instr->Bits(14, 10) == 16) { + const Instr masked_A0DF8000 = instr->Mask(0xA0DF8000); + if ((instr->Mask(0x80180000) != 0) || + (masked_A0DF8000 == 0x00020000) || + (masked_A0DF8000 == 0x00030000) || + (masked_A0DF8000 == 0x00068000) || + (masked_A0DF8000 == 0x00428000) || + (masked_A0DF8000 == 0x00430000) || + (masked_A0DF8000 == 0x00468000) || + (instr->Mask(0xA0D80000) == 0x00800000) || + (instr->Mask(0xA0DE0000) == 0x00C00000) || + (instr->Mask(0xA0DF0000) == 0x00C30000) || + (instr->Mask(0xA0DC0000) == 0x00C40000)) { + VisitUnallocated(instr); + } else { + VisitFPDataProcessing1Source(instr); + } + } else if (instr->Bits(13, 10) == 8) { + if ((instr->Bits(15, 14) != 0) || + (instr->Bits(2, 0) != 0) || + (instr->Mask(0x80800000) != 0x00000000)) { + VisitUnallocated(instr); + } else { + VisitFPCompare(instr); + } + } else if (instr->Bits(12, 10) == 4) { + if ((instr->Bits(9, 5) != 0) || + (instr->Mask(0x80800000) != 0x00000000)) { + VisitUnallocated(instr); + } else { + VisitFPImmediate(instr); + } + } else { + if (instr->Mask(0x80800000) != 0x00000000) { + VisitUnallocated(instr); + } else { + switch (instr->Bits(11, 10)) { + case 1: { + VisitFPConditionalCompare(instr); + break; + } + case 2: { + if ((instr->Bits(15, 14) == 0x3) || + (instr->Mask(0x00009000) == 0x00009000) || + (instr->Mask(0x0000A000) == 0x0000A000)) { + VisitUnallocated(instr); + } else { + VisitFPDataProcessing2Source(instr); + } + break; + } + case 3: { + VisitFPConditionalSelect(instr); + break; + } + default: VIXL_UNREACHABLE(); + } + } + } + } + } else { + // Bit 30 == 1 has been handled earlier. + VIXL_ASSERT(instr->Bit(30) == 0); + if (instr->Mask(0xA0800000) != 0) { + VisitUnallocated(instr); + } else { + VisitFPDataProcessing3Source(instr); + } + } + } + } + } +} + + +void Decoder::DecodeAdvSIMDLoadStore(Instruction* instr) { + // TODO: Implement Advanced SIMD load/store instruction decode. + VIXL_ASSERT(instr->Bits(29, 25) == 0x6); + VisitUnimplemented(instr); +} + + +void Decoder::DecodeAdvSIMDDataProcessing(Instruction* instr) { + // TODO: Implement Advanced SIMD data processing instruction decode. + VIXL_ASSERT(instr->Bits(27, 25) == 0x7); + VisitUnimplemented(instr); +} + + +#define DEFINE_VISITOR_CALLERS(A) \ + void Decoder::Visit##A(Instruction *instr) { \ + VIXL_ASSERT(instr->Mask(A##FMask) == A##Fixed); \ + std::list::iterator it; \ + for (it = visitors_.begin(); it != visitors_.end(); it++) { \ + (*it)->Visit##A(instr); \ + } \ + } +VISITOR_LIST(DEFINE_VISITOR_CALLERS) +#undef DEFINE_VISITOR_CALLERS +} // namespace vixl diff --git a/disas/libvixl/a64/decoder-a64.h b/disas/libvixl/a64/decoder-a64.h new file mode 100644 index 0000000000..bbbbd81247 --- /dev/null +++ b/disas/libvixl/a64/decoder-a64.h @@ -0,0 +1,198 @@ +// Copyright 2013, ARM Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of ARM Limited nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef VIXL_A64_DECODER_A64_H_ +#define VIXL_A64_DECODER_A64_H_ + +#include + +#include "globals.h" +#include "a64/instructions-a64.h" + + +// List macro containing all visitors needed by the decoder class. + +#define VISITOR_LIST(V) \ + V(PCRelAddressing) \ + V(AddSubImmediate) \ + V(LogicalImmediate) \ + V(MoveWideImmediate) \ + V(Bitfield) \ + V(Extract) \ + V(UnconditionalBranch) \ + V(UnconditionalBranchToRegister) \ + V(CompareBranch) \ + V(TestBranch) \ + V(ConditionalBranch) \ + V(System) \ + V(Exception) \ + V(LoadStorePairPostIndex) \ + V(LoadStorePairOffset) \ + V(LoadStorePairPreIndex) \ + V(LoadStorePairNonTemporal) \ + V(LoadLiteral) \ + V(LoadStoreUnscaledOffset) \ + V(LoadStorePostIndex) \ + V(LoadStorePreIndex) \ + V(LoadStoreRegisterOffset) \ + V(LoadStoreUnsignedOffset) \ + V(LogicalShifted) \ + V(AddSubShifted) \ + V(AddSubExtended) \ + V(AddSubWithCarry) \ + V(ConditionalCompareRegister) \ + V(ConditionalCompareImmediate) \ + V(ConditionalSelect) \ + V(DataProcessing1Source) \ + V(DataProcessing2Source) \ + V(DataProcessing3Source) \ + V(FPCompare) \ + V(FPConditionalCompare) \ + V(FPConditionalSelect) \ + V(FPImmediate) \ + V(FPDataProcessing1Source) \ + V(FPDataProcessing2Source) \ + V(FPDataProcessing3Source) \ + V(FPIntegerConvert) \ + V(FPFixedPointConvert) \ + V(Unallocated) \ + V(Unimplemented) + +namespace vixl { + +// The Visitor interface. Disassembler and simulator (and other tools) +// must provide implementations for all of these functions. +class DecoderVisitor { + public: + #define DECLARE(A) virtual void Visit##A(Instruction* instr) = 0; + VISITOR_LIST(DECLARE) + #undef DECLARE + + virtual ~DecoderVisitor() {} + + private: + // Visitors are registered in a list. + std::list visitors_; + + friend class Decoder; +}; + + +class Decoder: public DecoderVisitor { + public: + Decoder() {} + + // Top-level instruction decoder function. Decodes an instruction and calls + // the visitor functions registered with the Decoder class. + void Decode(Instruction *instr); + + // Register a new visitor class with the decoder. + // Decode() will call the corresponding visitor method from all registered + // visitor classes when decoding reaches the leaf node of the instruction + // decode tree. + // Visitors are called in the order. + // A visitor can only be registered once. + // Registering an already registered visitor will update its position. + // + // d.AppendVisitor(V1); + // d.AppendVisitor(V2); + // d.PrependVisitor(V2); // Move V2 at the start of the list. + // d.InsertVisitorBefore(V3, V2); + // d.AppendVisitor(V4); + // d.AppendVisitor(V4); // No effect. + // + // d.Decode(i); + // + // will call in order visitor methods in V3, V2, V1, V4. + void AppendVisitor(DecoderVisitor* visitor); + void PrependVisitor(DecoderVisitor* visitor); + void InsertVisitorBefore(DecoderVisitor* new_visitor, + DecoderVisitor* registered_visitor); + void InsertVisitorAfter(DecoderVisitor* new_visitor, + DecoderVisitor* registered_visitor); + + // Remove a previously registered visitor class from the list of visitors + // stored by the decoder. + void RemoveVisitor(DecoderVisitor* visitor); + + #define DECLARE(A) void Visit##A(Instruction* instr); + VISITOR_LIST(DECLARE) + #undef DECLARE + + private: + // Decode the PC relative addressing instruction, and call the corresponding + // visitors. + // On entry, instruction bits 27:24 = 0x0. + void DecodePCRelAddressing(Instruction* instr); + + // Decode the add/subtract immediate instruction, and call the correspoding + // visitors. + // On entry, instruction bits 27:24 = 0x1. + void DecodeAddSubImmediate(Instruction* instr); + + // Decode the branch, system command, and exception generation parts of + // the instruction tree, and call the corresponding visitors. + // On entry, instruction bits 27:24 = {0x4, 0x5, 0x6, 0x7}. + void DecodeBranchSystemException(Instruction* instr); + + // Decode the load and store parts of the instruction tree, and call + // the corresponding visitors. + // On entry, instruction bits 27:24 = {0x8, 0x9, 0xC, 0xD}. + void DecodeLoadStore(Instruction* instr); + + // Decode the logical immediate and move wide immediate parts of the + // instruction tree, and call the corresponding visitors. + // On entry, instruction bits 27:24 = 0x2. + void DecodeLogical(Instruction* instr); + + // Decode the bitfield and extraction parts of the instruction tree, + // and call the corresponding visitors. + // On entry, instruction bits 27:24 = 0x3. + void DecodeBitfieldExtract(Instruction* instr); + + // Decode the data processing parts of the instruction tree, and call the + // corresponding visitors. + // On entry, instruction bits 27:24 = {0x1, 0xA, 0xB}. + void DecodeDataProcessing(Instruction* instr); + + // Decode the floating point parts of the instruction tree, and call the + // corresponding visitors. + // On entry, instruction bits 27:24 = {0xE, 0xF}. + void DecodeFP(Instruction* instr); + + // Decode the Advanced SIMD (NEON) load/store part of the instruction tree, + // and call the corresponding visitors. + // On entry, instruction bits 29:25 = 0x6. + void DecodeAdvSIMDLoadStore(Instruction* instr); + + // Decode the Advanced SIMD (NEON) data processing part of the instruction + // tree, and call the corresponding visitors. + // On entry, instruction bits 27:25 = 0x7. + void DecodeAdvSIMDDataProcessing(Instruction* instr); +}; +} // namespace vixl + +#endif // VIXL_A64_DECODER_A64_H_ diff --git a/disas/libvixl/a64/disasm-a64.cc b/disas/libvixl/a64/disasm-a64.cc new file mode 100644 index 0000000000..aa133a99bf --- /dev/null +++ b/disas/libvixl/a64/disasm-a64.cc @@ -0,0 +1,1723 @@ +// Copyright 2013, ARM Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of ARM Limited nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "a64/disasm-a64.h" + +namespace vixl { + +Disassembler::Disassembler() { + buffer_size_ = 256; + buffer_ = reinterpret_cast(malloc(buffer_size_)); + buffer_pos_ = 0; + own_buffer_ = true; +} + + +Disassembler::Disassembler(char* text_buffer, int buffer_size) { + buffer_size_ = buffer_size; + buffer_ = text_buffer; + buffer_pos_ = 0; + own_buffer_ = false; +} + + +Disassembler::~Disassembler() { + if (own_buffer_) { + free(buffer_); + } +} + + +char* Disassembler::GetOutput() { + return buffer_; +} + + +void Disassembler::VisitAddSubImmediate(Instruction* instr) { + bool rd_is_zr = RdIsZROrSP(instr); + bool stack_op = (rd_is_zr || RnIsZROrSP(instr)) && + (instr->ImmAddSub() == 0) ? true : false; + const char *mnemonic = ""; + const char *form = "'Rds, 'Rns, 'IAddSub"; + const char *form_cmp = "'Rns, 'IAddSub"; + const char *form_mov = "'Rds, 'Rns"; + + switch (instr->Mask(AddSubImmediateMask)) { + case ADD_w_imm: + case ADD_x_imm: { + mnemonic = "add"; + if (stack_op) { + mnemonic = "mov"; + form = form_mov; + } + break; + } + case ADDS_w_imm: + case ADDS_x_imm: { + mnemonic = "adds"; + if (rd_is_zr) { + mnemonic = "cmn"; + form = form_cmp; + } + break; + } + case SUB_w_imm: + case SUB_x_imm: mnemonic = "sub"; break; + case SUBS_w_imm: + case SUBS_x_imm: { + mnemonic = "subs"; + if (rd_is_zr) { + mnemonic = "cmp"; + form = form_cmp; + } + break; + } + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitAddSubShifted(Instruction* instr) { + bool rd_is_zr = RdIsZROrSP(instr); + bool rn_is_zr = RnIsZROrSP(instr); + const char *mnemonic = ""; + const char *form = "'Rd, 'Rn, 'Rm'HDP"; + const char *form_cmp = "'Rn, 'Rm'HDP"; + const char *form_neg = "'Rd, 'Rm'HDP"; + + switch (instr->Mask(AddSubShiftedMask)) { + case ADD_w_shift: + case ADD_x_shift: mnemonic = "add"; break; + case ADDS_w_shift: + case ADDS_x_shift: { + mnemonic = "adds"; + if (rd_is_zr) { + mnemonic = "cmn"; + form = form_cmp; + } + break; + } + case SUB_w_shift: + case SUB_x_shift: { + mnemonic = "sub"; + if (rn_is_zr) { + mnemonic = "neg"; + form = form_neg; + } + break; + } + case SUBS_w_shift: + case SUBS_x_shift: { + mnemonic = "subs"; + if (rd_is_zr) { + mnemonic = "cmp"; + form = form_cmp; + } else if (rn_is_zr) { + mnemonic = "negs"; + form = form_neg; + } + break; + } + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitAddSubExtended(Instruction* instr) { + bool rd_is_zr = RdIsZROrSP(instr); + const char *mnemonic = ""; + Extend mode = static_cast(instr->ExtendMode()); + const char *form = ((mode == UXTX) || (mode == SXTX)) ? + "'Rds, 'Rns, 'Xm'Ext" : "'Rds, 'Rns, 'Wm'Ext"; + const char *form_cmp = ((mode == UXTX) || (mode == SXTX)) ? + "'Rns, 'Xm'Ext" : "'Rns, 'Wm'Ext"; + + switch (instr->Mask(AddSubExtendedMask)) { + case ADD_w_ext: + case ADD_x_ext: mnemonic = "add"; break; + case ADDS_w_ext: + case ADDS_x_ext: { + mnemonic = "adds"; + if (rd_is_zr) { + mnemonic = "cmn"; + form = form_cmp; + } + break; + } + case SUB_w_ext: + case SUB_x_ext: mnemonic = "sub"; break; + case SUBS_w_ext: + case SUBS_x_ext: { + mnemonic = "subs"; + if (rd_is_zr) { + mnemonic = "cmp"; + form = form_cmp; + } + break; + } + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitAddSubWithCarry(Instruction* instr) { + bool rn_is_zr = RnIsZROrSP(instr); + const char *mnemonic = ""; + const char *form = "'Rd, 'Rn, 'Rm"; + const char *form_neg = "'Rd, 'Rm"; + + switch (instr->Mask(AddSubWithCarryMask)) { + case ADC_w: + case ADC_x: mnemonic = "adc"; break; + case ADCS_w: + case ADCS_x: mnemonic = "adcs"; break; + case SBC_w: + case SBC_x: { + mnemonic = "sbc"; + if (rn_is_zr) { + mnemonic = "ngc"; + form = form_neg; + } + break; + } + case SBCS_w: + case SBCS_x: { + mnemonic = "sbcs"; + if (rn_is_zr) { + mnemonic = "ngcs"; + form = form_neg; + } + break; + } + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitLogicalImmediate(Instruction* instr) { + bool rd_is_zr = RdIsZROrSP(instr); + bool rn_is_zr = RnIsZROrSP(instr); + const char *mnemonic = ""; + const char *form = "'Rds, 'Rn, 'ITri"; + + if (instr->ImmLogical() == 0) { + // The immediate encoded in the instruction is not in the expected format. + Format(instr, "unallocated", "(LogicalImmediate)"); + return; + } + + switch (instr->Mask(LogicalImmediateMask)) { + case AND_w_imm: + case AND_x_imm: mnemonic = "and"; break; + case ORR_w_imm: + case ORR_x_imm: { + mnemonic = "orr"; + unsigned reg_size = (instr->SixtyFourBits() == 1) ? kXRegSize + : kWRegSize; + if (rn_is_zr && !IsMovzMovnImm(reg_size, instr->ImmLogical())) { + mnemonic = "mov"; + form = "'Rds, 'ITri"; + } + break; + } + case EOR_w_imm: + case EOR_x_imm: mnemonic = "eor"; break; + case ANDS_w_imm: + case ANDS_x_imm: { + mnemonic = "ands"; + if (rd_is_zr) { + mnemonic = "tst"; + form = "'Rn, 'ITri"; + } + break; + } + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +bool Disassembler::IsMovzMovnImm(unsigned reg_size, uint64_t value) { + VIXL_ASSERT((reg_size == kXRegSize) || + ((reg_size == kWRegSize) && (value <= 0xffffffff))); + + // Test for movz: 16 bits set at positions 0, 16, 32 or 48. + if (((value & UINT64_C(0xffffffffffff0000)) == 0) || + ((value & UINT64_C(0xffffffff0000ffff)) == 0) || + ((value & UINT64_C(0xffff0000ffffffff)) == 0) || + ((value & UINT64_C(0x0000ffffffffffff)) == 0)) { + return true; + } + + // Test for movn: NOT(16 bits set at positions 0, 16, 32 or 48). + if ((reg_size == kXRegSize) && + (((~value & UINT64_C(0xffffffffffff0000)) == 0) || + ((~value & UINT64_C(0xffffffff0000ffff)) == 0) || + ((~value & UINT64_C(0xffff0000ffffffff)) == 0) || + ((~value & UINT64_C(0x0000ffffffffffff)) == 0))) { + return true; + } + if ((reg_size == kWRegSize) && + (((value & 0xffff0000) == 0xffff0000) || + ((value & 0x0000ffff) == 0x0000ffff))) { + return true; + } + return false; +} + + +void Disassembler::VisitLogicalShifted(Instruction* instr) { + bool rd_is_zr = RdIsZROrSP(instr); + bool rn_is_zr = RnIsZROrSP(instr); + const char *mnemonic = ""; + const char *form = "'Rd, 'Rn, 'Rm'HLo"; + + switch (instr->Mask(LogicalShiftedMask)) { + case AND_w: + case AND_x: mnemonic = "and"; break; + case BIC_w: + case BIC_x: mnemonic = "bic"; break; + case EOR_w: + case EOR_x: mnemonic = "eor"; break; + case EON_w: + case EON_x: mnemonic = "eon"; break; + case BICS_w: + case BICS_x: mnemonic = "bics"; break; + case ANDS_w: + case ANDS_x: { + mnemonic = "ands"; + if (rd_is_zr) { + mnemonic = "tst"; + form = "'Rn, 'Rm'HLo"; + } + break; + } + case ORR_w: + case ORR_x: { + mnemonic = "orr"; + if (rn_is_zr && (instr->ImmDPShift() == 0) && (instr->ShiftDP() == LSL)) { + mnemonic = "mov"; + form = "'Rd, 'Rm"; + } + break; + } + case ORN_w: + case ORN_x: { + mnemonic = "orn"; + if (rn_is_zr) { + mnemonic = "mvn"; + form = "'Rd, 'Rm'HLo"; + } + break; + } + default: VIXL_UNREACHABLE(); + } + + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitConditionalCompareRegister(Instruction* instr) { + const char *mnemonic = ""; + const char *form = "'Rn, 'Rm, 'INzcv, 'Cond"; + + switch (instr->Mask(ConditionalCompareRegisterMask)) { + case CCMN_w: + case CCMN_x: mnemonic = "ccmn"; break; + case CCMP_w: + case CCMP_x: mnemonic = "ccmp"; break; + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitConditionalCompareImmediate(Instruction* instr) { + const char *mnemonic = ""; + const char *form = "'Rn, 'IP, 'INzcv, 'Cond"; + + switch (instr->Mask(ConditionalCompareImmediateMask)) { + case CCMN_w_imm: + case CCMN_x_imm: mnemonic = "ccmn"; break; + case CCMP_w_imm: + case CCMP_x_imm: mnemonic = "ccmp"; break; + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitConditionalSelect(Instruction* instr) { + bool rnm_is_zr = (RnIsZROrSP(instr) && RmIsZROrSP(instr)); + bool rn_is_rm = (instr->Rn() == instr->Rm()); + const char *mnemonic = ""; + const char *form = "'Rd, 'Rn, 'Rm, 'Cond"; + const char *form_test = "'Rd, 'CInv"; + const char *form_update = "'Rd, 'Rn, 'CInv"; + + Condition cond = static_cast(instr->Condition()); + bool invertible_cond = (cond != al) && (cond != nv); + + switch (instr->Mask(ConditionalSelectMask)) { + case CSEL_w: + case CSEL_x: mnemonic = "csel"; break; + case CSINC_w: + case CSINC_x: { + mnemonic = "csinc"; + if (rnm_is_zr && invertible_cond) { + mnemonic = "cset"; + form = form_test; + } else if (rn_is_rm && invertible_cond) { + mnemonic = "cinc"; + form = form_update; + } + break; + } + case CSINV_w: + case CSINV_x: { + mnemonic = "csinv"; + if (rnm_is_zr && invertible_cond) { + mnemonic = "csetm"; + form = form_test; + } else if (rn_is_rm && invertible_cond) { + mnemonic = "cinv"; + form = form_update; + } + break; + } + case CSNEG_w: + case CSNEG_x: { + mnemonic = "csneg"; + if (rn_is_rm && invertible_cond) { + mnemonic = "cneg"; + form = form_update; + } + break; + } + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitBitfield(Instruction* instr) { + unsigned s = instr->ImmS(); + unsigned r = instr->ImmR(); + unsigned rd_size_minus_1 = + ((instr->SixtyFourBits() == 1) ? kXRegSize : kWRegSize) - 1; + const char *mnemonic = ""; + const char *form = ""; + const char *form_shift_right = "'Rd, 'Rn, 'IBr"; + const char *form_extend = "'Rd, 'Wn"; + const char *form_bfiz = "'Rd, 'Rn, 'IBZ-r, 'IBs+1"; + const char *form_bfx = "'Rd, 'Rn, 'IBr, 'IBs-r+1"; + const char *form_lsl = "'Rd, 'Rn, 'IBZ-r"; + + switch (instr->Mask(BitfieldMask)) { + case SBFM_w: + case SBFM_x: { + mnemonic = "sbfx"; + form = form_bfx; + if (r == 0) { + form = form_extend; + if (s == 7) { + mnemonic = "sxtb"; + } else if (s == 15) { + mnemonic = "sxth"; + } else if ((s == 31) && (instr->SixtyFourBits() == 1)) { + mnemonic = "sxtw"; + } else { + form = form_bfx; + } + } else if (s == rd_size_minus_1) { + mnemonic = "asr"; + form = form_shift_right; + } else if (s < r) { + mnemonic = "sbfiz"; + form = form_bfiz; + } + break; + } + case UBFM_w: + case UBFM_x: { + mnemonic = "ubfx"; + form = form_bfx; + if (r == 0) { + form = form_extend; + if (s == 7) { + mnemonic = "uxtb"; + } else if (s == 15) { + mnemonic = "uxth"; + } else { + form = form_bfx; + } + } + if (s == rd_size_minus_1) { + mnemonic = "lsr"; + form = form_shift_right; + } else if (r == s + 1) { + mnemonic = "lsl"; + form = form_lsl; + } else if (s < r) { + mnemonic = "ubfiz"; + form = form_bfiz; + } + break; + } + case BFM_w: + case BFM_x: { + mnemonic = "bfxil"; + form = form_bfx; + if (s < r) { + mnemonic = "bfi"; + form = form_bfiz; + } + } + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitExtract(Instruction* instr) { + const char *mnemonic = ""; + const char *form = "'Rd, 'Rn, 'Rm, 'IExtract"; + + switch (instr->Mask(ExtractMask)) { + case EXTR_w: + case EXTR_x: { + if (instr->Rn() == instr->Rm()) { + mnemonic = "ror"; + form = "'Rd, 'Rn, 'IExtract"; + } else { + mnemonic = "extr"; + } + break; + } + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitPCRelAddressing(Instruction* instr) { + switch (instr->Mask(PCRelAddressingMask)) { + case ADR: Format(instr, "adr", "'Xd, 'AddrPCRelByte"); break; + // ADRP is not implemented. + default: Format(instr, "unimplemented", "(PCRelAddressing)"); + } +} + + +void Disassembler::VisitConditionalBranch(Instruction* instr) { + switch (instr->Mask(ConditionalBranchMask)) { + case B_cond: Format(instr, "b.'CBrn", "'BImmCond"); break; + default: VIXL_UNREACHABLE(); + } +} + + +void Disassembler::VisitUnconditionalBranchToRegister(Instruction* instr) { + const char *mnemonic = "unimplemented"; + const char *form = "'Xn"; + + switch (instr->Mask(UnconditionalBranchToRegisterMask)) { + case BR: mnemonic = "br"; break; + case BLR: mnemonic = "blr"; break; + case RET: { + mnemonic = "ret"; + if (instr->Rn() == kLinkRegCode) { + form = NULL; + } + break; + } + default: form = "(UnconditionalBranchToRegister)"; + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitUnconditionalBranch(Instruction* instr) { + const char *mnemonic = ""; + const char *form = "'BImmUncn"; + + switch (instr->Mask(UnconditionalBranchMask)) { + case B: mnemonic = "b"; break; + case BL: mnemonic = "bl"; break; + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitDataProcessing1Source(Instruction* instr) { + const char *mnemonic = ""; + const char *form = "'Rd, 'Rn"; + + switch (instr->Mask(DataProcessing1SourceMask)) { + #define FORMAT(A, B) \ + case A##_w: \ + case A##_x: mnemonic = B; break; + FORMAT(RBIT, "rbit"); + FORMAT(REV16, "rev16"); + FORMAT(REV, "rev"); + FORMAT(CLZ, "clz"); + FORMAT(CLS, "cls"); + #undef FORMAT + case REV32_x: mnemonic = "rev32"; break; + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitDataProcessing2Source(Instruction* instr) { + const char *mnemonic = "unimplemented"; + const char *form = "'Rd, 'Rn, 'Rm"; + + switch (instr->Mask(DataProcessing2SourceMask)) { + #define FORMAT(A, B) \ + case A##_w: \ + case A##_x: mnemonic = B; break; + FORMAT(UDIV, "udiv"); + FORMAT(SDIV, "sdiv"); + FORMAT(LSLV, "lsl"); + FORMAT(LSRV, "lsr"); + FORMAT(ASRV, "asr"); + FORMAT(RORV, "ror"); + #undef FORMAT + default: form = "(DataProcessing2Source)"; + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitDataProcessing3Source(Instruction* instr) { + bool ra_is_zr = RaIsZROrSP(instr); + const char *mnemonic = ""; + const char *form = "'Xd, 'Wn, 'Wm, 'Xa"; + const char *form_rrr = "'Rd, 'Rn, 'Rm"; + const char *form_rrrr = "'Rd, 'Rn, 'Rm, 'Ra"; + const char *form_xww = "'Xd, 'Wn, 'Wm"; + const char *form_xxx = "'Xd, 'Xn, 'Xm"; + + switch (instr->Mask(DataProcessing3SourceMask)) { + case MADD_w: + case MADD_x: { + mnemonic = "madd"; + form = form_rrrr; + if (ra_is_zr) { + mnemonic = "mul"; + form = form_rrr; + } + break; + } + case MSUB_w: + case MSUB_x: { + mnemonic = "msub"; + form = form_rrrr; + if (ra_is_zr) { + mnemonic = "mneg"; + form = form_rrr; + } + break; + } + case SMADDL_x: { + mnemonic = "smaddl"; + if (ra_is_zr) { + mnemonic = "smull"; + form = form_xww; + } + break; + } + case SMSUBL_x: { + mnemonic = "smsubl"; + if (ra_is_zr) { + mnemonic = "smnegl"; + form = form_xww; + } + break; + } + case UMADDL_x: { + mnemonic = "umaddl"; + if (ra_is_zr) { + mnemonic = "umull"; + form = form_xww; + } + break; + } + case UMSUBL_x: { + mnemonic = "umsubl"; + if (ra_is_zr) { + mnemonic = "umnegl"; + form = form_xww; + } + break; + } + case SMULH_x: { + mnemonic = "smulh"; + form = form_xxx; + break; + } + case UMULH_x: { + mnemonic = "umulh"; + form = form_xxx; + break; + } + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitCompareBranch(Instruction* instr) { + const char *mnemonic = ""; + const char *form = "'Rt, 'BImmCmpa"; + + switch (instr->Mask(CompareBranchMask)) { + case CBZ_w: + case CBZ_x: mnemonic = "cbz"; break; + case CBNZ_w: + case CBNZ_x: mnemonic = "cbnz"; break; + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitTestBranch(Instruction* instr) { + const char *mnemonic = ""; + // If the top bit of the immediate is clear, the tested register is + // disassembled as Wt, otherwise Xt. As the top bit of the immediate is + // encoded in bit 31 of the instruction, we can reuse the Rt form, which + // uses bit 31 (normally "sf") to choose the register size. + const char *form = "'Rt, 'IS, 'BImmTest"; + + switch (instr->Mask(TestBranchMask)) { + case TBZ: mnemonic = "tbz"; break; + case TBNZ: mnemonic = "tbnz"; break; + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitMoveWideImmediate(Instruction* instr) { + const char *mnemonic = ""; + const char *form = "'Rd, 'IMoveImm"; + + // Print the shift separately for movk, to make it clear which half word will + // be overwritten. Movn and movz print the computed immediate, which includes + // shift calculation. + switch (instr->Mask(MoveWideImmediateMask)) { + case MOVN_w: + case MOVN_x: mnemonic = "movn"; break; + case MOVZ_w: + case MOVZ_x: mnemonic = "movz"; break; + case MOVK_w: + case MOVK_x: mnemonic = "movk"; form = "'Rd, 'IMoveLSL"; break; + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +#define LOAD_STORE_LIST(V) \ + V(STRB_w, "strb", "'Wt") \ + V(STRH_w, "strh", "'Wt") \ + V(STR_w, "str", "'Wt") \ + V(STR_x, "str", "'Xt") \ + V(LDRB_w, "ldrb", "'Wt") \ + V(LDRH_w, "ldrh", "'Wt") \ + V(LDR_w, "ldr", "'Wt") \ + V(LDR_x, "ldr", "'Xt") \ + V(LDRSB_x, "ldrsb", "'Xt") \ + V(LDRSH_x, "ldrsh", "'Xt") \ + V(LDRSW_x, "ldrsw", "'Xt") \ + V(LDRSB_w, "ldrsb", "'Wt") \ + V(LDRSH_w, "ldrsh", "'Wt") \ + V(STR_s, "str", "'St") \ + V(STR_d, "str", "'Dt") \ + V(LDR_s, "ldr", "'St") \ + V(LDR_d, "ldr", "'Dt") + +void Disassembler::VisitLoadStorePreIndex(Instruction* instr) { + const char *mnemonic = "unimplemented"; + const char *form = "(LoadStorePreIndex)"; + + switch (instr->Mask(LoadStorePreIndexMask)) { + #define LS_PREINDEX(A, B, C) \ + case A##_pre: mnemonic = B; form = C ", ['Xns'ILS]!"; break; + LOAD_STORE_LIST(LS_PREINDEX) + #undef LS_PREINDEX + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitLoadStorePostIndex(Instruction* instr) { + const char *mnemonic = "unimplemented"; + const char *form = "(LoadStorePostIndex)"; + + switch (instr->Mask(LoadStorePostIndexMask)) { + #define LS_POSTINDEX(A, B, C) \ + case A##_post: mnemonic = B; form = C ", ['Xns]'ILS"; break; + LOAD_STORE_LIST(LS_POSTINDEX) + #undef LS_POSTINDEX + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitLoadStoreUnsignedOffset(Instruction* instr) { + const char *mnemonic = "unimplemented"; + const char *form = "(LoadStoreUnsignedOffset)"; + + switch (instr->Mask(LoadStoreUnsignedOffsetMask)) { + #define LS_UNSIGNEDOFFSET(A, B, C) \ + case A##_unsigned: mnemonic = B; form = C ", ['Xns'ILU]"; break; + LOAD_STORE_LIST(LS_UNSIGNEDOFFSET) + #undef LS_UNSIGNEDOFFSET + case PRFM_unsigned: mnemonic = "prfm"; form = "'PrefOp, ['Xn'ILU]"; + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitLoadStoreRegisterOffset(Instruction* instr) { + const char *mnemonic = "unimplemented"; + const char *form = "(LoadStoreRegisterOffset)"; + + switch (instr->Mask(LoadStoreRegisterOffsetMask)) { + #define LS_REGISTEROFFSET(A, B, C) \ + case A##_reg: mnemonic = B; form = C ", ['Xns, 'Offsetreg]"; break; + LOAD_STORE_LIST(LS_REGISTEROFFSET) + #undef LS_REGISTEROFFSET + case PRFM_reg: mnemonic = "prfm"; form = "'PrefOp, ['Xns, 'Offsetreg]"; + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitLoadStoreUnscaledOffset(Instruction* instr) { + const char *mnemonic = "unimplemented"; + const char *form = "'Wt, ['Xns'ILS]"; + const char *form_x = "'Xt, ['Xns'ILS]"; + const char *form_s = "'St, ['Xns'ILS]"; + const char *form_d = "'Dt, ['Xns'ILS]"; + + switch (instr->Mask(LoadStoreUnscaledOffsetMask)) { + case STURB_w: mnemonic = "sturb"; break; + case STURH_w: mnemonic = "sturh"; break; + case STUR_w: mnemonic = "stur"; break; + case STUR_x: mnemonic = "stur"; form = form_x; break; + case STUR_s: mnemonic = "stur"; form = form_s; break; + case STUR_d: mnemonic = "stur"; form = form_d; break; + case LDURB_w: mnemonic = "ldurb"; break; + case LDURH_w: mnemonic = "ldurh"; break; + case LDUR_w: mnemonic = "ldur"; break; + case LDUR_x: mnemonic = "ldur"; form = form_x; break; + case LDUR_s: mnemonic = "ldur"; form = form_s; break; + case LDUR_d: mnemonic = "ldur"; form = form_d; break; + case LDURSB_x: form = form_x; // Fall through. + case LDURSB_w: mnemonic = "ldursb"; break; + case LDURSH_x: form = form_x; // Fall through. + case LDURSH_w: mnemonic = "ldursh"; break; + case LDURSW_x: mnemonic = "ldursw"; form = form_x; break; + default: form = "(LoadStoreUnscaledOffset)"; + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitLoadLiteral(Instruction* instr) { + const char *mnemonic = "ldr"; + const char *form = "(LoadLiteral)"; + + switch (instr->Mask(LoadLiteralMask)) { + case LDR_w_lit: form = "'Wt, 'ILLiteral 'LValue"; break; + case LDR_x_lit: form = "'Xt, 'ILLiteral 'LValue"; break; + case LDR_s_lit: form = "'St, 'ILLiteral 'LValue"; break; + case LDR_d_lit: form = "'Dt, 'ILLiteral 'LValue"; break; + default: mnemonic = "unimplemented"; + } + Format(instr, mnemonic, form); +} + + +#define LOAD_STORE_PAIR_LIST(V) \ + V(STP_w, "stp", "'Wt, 'Wt2", "4") \ + V(LDP_w, "ldp", "'Wt, 'Wt2", "4") \ + V(LDPSW_x, "ldpsw", "'Xt, 'Xt2", "4") \ + V(STP_x, "stp", "'Xt, 'Xt2", "8") \ + V(LDP_x, "ldp", "'Xt, 'Xt2", "8") \ + V(STP_s, "stp", "'St, 'St2", "4") \ + V(LDP_s, "ldp", "'St, 'St2", "4") \ + V(STP_d, "stp", "'Dt, 'Dt2", "8") \ + V(LDP_d, "ldp", "'Dt, 'Dt2", "8") + +void Disassembler::VisitLoadStorePairPostIndex(Instruction* instr) { + const char *mnemonic = "unimplemented"; + const char *form = "(LoadStorePairPostIndex)"; + + switch (instr->Mask(LoadStorePairPostIndexMask)) { + #define LSP_POSTINDEX(A, B, C, D) \ + case A##_post: mnemonic = B; form = C ", ['Xns]'ILP" D; break; + LOAD_STORE_PAIR_LIST(LSP_POSTINDEX) + #undef LSP_POSTINDEX + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitLoadStorePairPreIndex(Instruction* instr) { + const char *mnemonic = "unimplemented"; + const char *form = "(LoadStorePairPreIndex)"; + + switch (instr->Mask(LoadStorePairPreIndexMask)) { + #define LSP_PREINDEX(A, B, C, D) \ + case A##_pre: mnemonic = B; form = C ", ['Xns'ILP" D "]!"; break; + LOAD_STORE_PAIR_LIST(LSP_PREINDEX) + #undef LSP_PREINDEX + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitLoadStorePairOffset(Instruction* instr) { + const char *mnemonic = "unimplemented"; + const char *form = "(LoadStorePairOffset)"; + + switch (instr->Mask(LoadStorePairOffsetMask)) { + #define LSP_OFFSET(A, B, C, D) \ + case A##_off: mnemonic = B; form = C ", ['Xns'ILP" D "]"; break; + LOAD_STORE_PAIR_LIST(LSP_OFFSET) + #undef LSP_OFFSET + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitLoadStorePairNonTemporal(Instruction* instr) { + const char *mnemonic = "unimplemented"; + const char *form; + + switch (instr->Mask(LoadStorePairNonTemporalMask)) { + case STNP_w: mnemonic = "stnp"; form = "'Wt, 'Wt2, ['Xns'ILP4]"; break; + case LDNP_w: mnemonic = "ldnp"; form = "'Wt, 'Wt2, ['Xns'ILP4]"; break; + case STNP_x: mnemonic = "stnp"; form = "'Xt, 'Xt2, ['Xns'ILP8]"; break; + case LDNP_x: mnemonic = "ldnp"; form = "'Xt, 'Xt2, ['Xns'ILP8]"; break; + case STNP_s: mnemonic = "stnp"; form = "'St, 'St2, ['Xns'ILP4]"; break; + case LDNP_s: mnemonic = "ldnp"; form = "'St, 'St2, ['Xns'ILP4]"; break; + case STNP_d: mnemonic = "stnp"; form = "'Dt, 'Dt2, ['Xns'ILP8]"; break; + case LDNP_d: mnemonic = "ldnp"; form = "'Dt, 'Dt2, ['Xns'ILP8]"; break; + default: form = "(LoadStorePairNonTemporal)"; + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitFPCompare(Instruction* instr) { + const char *mnemonic = "unimplemented"; + const char *form = "'Fn, 'Fm"; + const char *form_zero = "'Fn, #0.0"; + + switch (instr->Mask(FPCompareMask)) { + case FCMP_s_zero: + case FCMP_d_zero: form = form_zero; // Fall through. + case FCMP_s: + case FCMP_d: mnemonic = "fcmp"; break; + default: form = "(FPCompare)"; + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitFPConditionalCompare(Instruction* instr) { + const char *mnemonic = "unmplemented"; + const char *form = "'Fn, 'Fm, 'INzcv, 'Cond"; + + switch (instr->Mask(FPConditionalCompareMask)) { + case FCCMP_s: + case FCCMP_d: mnemonic = "fccmp"; break; + case FCCMPE_s: + case FCCMPE_d: mnemonic = "fccmpe"; break; + default: form = "(FPConditionalCompare)"; + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitFPConditionalSelect(Instruction* instr) { + const char *mnemonic = ""; + const char *form = "'Fd, 'Fn, 'Fm, 'Cond"; + + switch (instr->Mask(FPConditionalSelectMask)) { + case FCSEL_s: + case FCSEL_d: mnemonic = "fcsel"; break; + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitFPDataProcessing1Source(Instruction* instr) { + const char *mnemonic = "unimplemented"; + const char *form = "'Fd, 'Fn"; + + switch (instr->Mask(FPDataProcessing1SourceMask)) { + #define FORMAT(A, B) \ + case A##_s: \ + case A##_d: mnemonic = B; break; + FORMAT(FMOV, "fmov"); + FORMAT(FABS, "fabs"); + FORMAT(FNEG, "fneg"); + FORMAT(FSQRT, "fsqrt"); + FORMAT(FRINTN, "frintn"); + FORMAT(FRINTP, "frintp"); + FORMAT(FRINTM, "frintm"); + FORMAT(FRINTZ, "frintz"); + FORMAT(FRINTA, "frinta"); + FORMAT(FRINTX, "frintx"); + FORMAT(FRINTI, "frinti"); + #undef FORMAT + case FCVT_ds: mnemonic = "fcvt"; form = "'Dd, 'Sn"; break; + case FCVT_sd: mnemonic = "fcvt"; form = "'Sd, 'Dn"; break; + default: form = "(FPDataProcessing1Source)"; + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitFPDataProcessing2Source(Instruction* instr) { + const char *mnemonic = ""; + const char *form = "'Fd, 'Fn, 'Fm"; + + switch (instr->Mask(FPDataProcessing2SourceMask)) { + #define FORMAT(A, B) \ + case A##_s: \ + case A##_d: mnemonic = B; break; + FORMAT(FMUL, "fmul"); + FORMAT(FDIV, "fdiv"); + FORMAT(FADD, "fadd"); + FORMAT(FSUB, "fsub"); + FORMAT(FMAX, "fmax"); + FORMAT(FMIN, "fmin"); + FORMAT(FMAXNM, "fmaxnm"); + FORMAT(FMINNM, "fminnm"); + FORMAT(FNMUL, "fnmul"); + #undef FORMAT + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitFPDataProcessing3Source(Instruction* instr) { + const char *mnemonic = ""; + const char *form = "'Fd, 'Fn, 'Fm, 'Fa"; + + switch (instr->Mask(FPDataProcessing3SourceMask)) { + #define FORMAT(A, B) \ + case A##_s: \ + case A##_d: mnemonic = B; break; + FORMAT(FMADD, "fmadd"); + FORMAT(FMSUB, "fmsub"); + FORMAT(FNMADD, "fnmadd"); + FORMAT(FNMSUB, "fnmsub"); + #undef FORMAT + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitFPImmediate(Instruction* instr) { + const char *mnemonic = ""; + const char *form = "(FPImmediate)"; + + switch (instr->Mask(FPImmediateMask)) { + case FMOV_s_imm: mnemonic = "fmov"; form = "'Sd, 'IFPSingle"; break; + case FMOV_d_imm: mnemonic = "fmov"; form = "'Dd, 'IFPDouble"; break; + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitFPIntegerConvert(Instruction* instr) { + const char *mnemonic = "unimplemented"; + const char *form = "(FPIntegerConvert)"; + const char *form_rf = "'Rd, 'Fn"; + const char *form_fr = "'Fd, 'Rn"; + + switch (instr->Mask(FPIntegerConvertMask)) { + case FMOV_ws: + case FMOV_xd: mnemonic = "fmov"; form = form_rf; break; + case FMOV_sw: + case FMOV_dx: mnemonic = "fmov"; form = form_fr; break; + case FCVTAS_ws: + case FCVTAS_xs: + case FCVTAS_wd: + case FCVTAS_xd: mnemonic = "fcvtas"; form = form_rf; break; + case FCVTAU_ws: + case FCVTAU_xs: + case FCVTAU_wd: + case FCVTAU_xd: mnemonic = "fcvtau"; form = form_rf; break; + case FCVTMS_ws: + case FCVTMS_xs: + case FCVTMS_wd: + case FCVTMS_xd: mnemonic = "fcvtms"; form = form_rf; break; + case FCVTMU_ws: + case FCVTMU_xs: + case FCVTMU_wd: + case FCVTMU_xd: mnemonic = "fcvtmu"; form = form_rf; break; + case FCVTNS_ws: + case FCVTNS_xs: + case FCVTNS_wd: + case FCVTNS_xd: mnemonic = "fcvtns"; form = form_rf; break; + case FCVTNU_ws: + case FCVTNU_xs: + case FCVTNU_wd: + case FCVTNU_xd: mnemonic = "fcvtnu"; form = form_rf; break; + case FCVTZU_xd: + case FCVTZU_ws: + case FCVTZU_wd: + case FCVTZU_xs: mnemonic = "fcvtzu"; form = form_rf; break; + case FCVTZS_xd: + case FCVTZS_wd: + case FCVTZS_xs: + case FCVTZS_ws: mnemonic = "fcvtzs"; form = form_rf; break; + case SCVTF_sw: + case SCVTF_sx: + case SCVTF_dw: + case SCVTF_dx: mnemonic = "scvtf"; form = form_fr; break; + case UCVTF_sw: + case UCVTF_sx: + case UCVTF_dw: + case UCVTF_dx: mnemonic = "ucvtf"; form = form_fr; break; + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitFPFixedPointConvert(Instruction* instr) { + const char *mnemonic = ""; + const char *form = "'Rd, 'Fn, 'IFPFBits"; + const char *form_fr = "'Fd, 'Rn, 'IFPFBits"; + + switch (instr->Mask(FPFixedPointConvertMask)) { + case FCVTZS_ws_fixed: + case FCVTZS_xs_fixed: + case FCVTZS_wd_fixed: + case FCVTZS_xd_fixed: mnemonic = "fcvtzs"; break; + case FCVTZU_ws_fixed: + case FCVTZU_xs_fixed: + case FCVTZU_wd_fixed: + case FCVTZU_xd_fixed: mnemonic = "fcvtzu"; break; + case SCVTF_sw_fixed: + case SCVTF_sx_fixed: + case SCVTF_dw_fixed: + case SCVTF_dx_fixed: mnemonic = "scvtf"; form = form_fr; break; + case UCVTF_sw_fixed: + case UCVTF_sx_fixed: + case UCVTF_dw_fixed: + case UCVTF_dx_fixed: mnemonic = "ucvtf"; form = form_fr; break; + default: VIXL_UNREACHABLE(); + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitSystem(Instruction* instr) { + // Some system instructions hijack their Op and Cp fields to represent a + // range of immediates instead of indicating a different instruction. This + // makes the decoding tricky. + const char *mnemonic = "unimplemented"; + const char *form = "(System)"; + + if (instr->Mask(SystemSysRegFMask) == SystemSysRegFixed) { + switch (instr->Mask(SystemSysRegMask)) { + case MRS: { + mnemonic = "mrs"; + switch (instr->ImmSystemRegister()) { + case NZCV: form = "'Xt, nzcv"; break; + case FPCR: form = "'Xt, fpcr"; break; + default: form = "'Xt, (unknown)"; break; + } + break; + } + case MSR: { + mnemonic = "msr"; + switch (instr->ImmSystemRegister()) { + case NZCV: form = "nzcv, 'Xt"; break; + case FPCR: form = "fpcr, 'Xt"; break; + default: form = "(unknown), 'Xt"; break; + } + break; + } + } + } else if (instr->Mask(SystemHintFMask) == SystemHintFixed) { + VIXL_ASSERT(instr->Mask(SystemHintMask) == HINT); + switch (instr->ImmHint()) { + case NOP: { + mnemonic = "nop"; + form = NULL; + break; + } + } + } else if (instr->Mask(MemBarrierFMask) == MemBarrierFixed) { + switch (instr->Mask(MemBarrierMask)) { + case DMB: { + mnemonic = "dmb"; + form = "'M"; + break; + } + case DSB: { + mnemonic = "dsb"; + form = "'M"; + break; + } + case ISB: { + mnemonic = "isb"; + form = NULL; + break; + } + } + } + + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitException(Instruction* instr) { + const char *mnemonic = "unimplemented"; + const char *form = "'IDebug"; + + switch (instr->Mask(ExceptionMask)) { + case HLT: mnemonic = "hlt"; break; + case BRK: mnemonic = "brk"; break; + case SVC: mnemonic = "svc"; break; + case HVC: mnemonic = "hvc"; break; + case SMC: mnemonic = "smc"; break; + case DCPS1: mnemonic = "dcps1"; form = "{'IDebug}"; break; + case DCPS2: mnemonic = "dcps2"; form = "{'IDebug}"; break; + case DCPS3: mnemonic = "dcps3"; form = "{'IDebug}"; break; + default: form = "(Exception)"; + } + Format(instr, mnemonic, form); +} + + +void Disassembler::VisitUnimplemented(Instruction* instr) { + Format(instr, "unimplemented", "(Unimplemented)"); +} + + +void Disassembler::VisitUnallocated(Instruction* instr) { + Format(instr, "unallocated", "(Unallocated)"); +} + + +void Disassembler::ProcessOutput(Instruction* /*instr*/) { + // The base disasm does nothing more than disassembling into a buffer. +} + + +void Disassembler::Format(Instruction* instr, const char* mnemonic, + const char* format) { + VIXL_ASSERT(mnemonic != NULL); + ResetOutput(); + Substitute(instr, mnemonic); + if (format != NULL) { + buffer_[buffer_pos_++] = ' '; + Substitute(instr, format); + } + buffer_[buffer_pos_] = 0; + ProcessOutput(instr); +} + + +void Disassembler::Substitute(Instruction* instr, const char* string) { + char chr = *string++; + while (chr != '\0') { + if (chr == '\'') { + string += SubstituteField(instr, string); + } else { + buffer_[buffer_pos_++] = chr; + } + chr = *string++; + } +} + + +int Disassembler::SubstituteField(Instruction* instr, const char* format) { + switch (format[0]) { + case 'R': // Register. X or W, selected by sf bit. + case 'F': // FP Register. S or D, selected by type field. + case 'W': + case 'X': + case 'S': + case 'D': return SubstituteRegisterField(instr, format); + case 'I': return SubstituteImmediateField(instr, format); + case 'L': return SubstituteLiteralField(instr, format); + case 'H': return SubstituteShiftField(instr, format); + case 'P': return SubstitutePrefetchField(instr, format); + case 'C': return SubstituteConditionField(instr, format); + case 'E': return SubstituteExtendField(instr, format); + case 'A': return SubstitutePCRelAddressField(instr, format); + case 'B': return SubstituteBranchTargetField(instr, format); + case 'O': return SubstituteLSRegOffsetField(instr, format); + case 'M': return SubstituteBarrierField(instr, format); + default: { + VIXL_UNREACHABLE(); + return 1; + } + } +} + + +int Disassembler::SubstituteRegisterField(Instruction* instr, + const char* format) { + unsigned reg_num = 0; + unsigned field_len = 2; + switch (format[1]) { + case 'd': reg_num = instr->Rd(); break; + case 'n': reg_num = instr->Rn(); break; + case 'm': reg_num = instr->Rm(); break; + case 'a': reg_num = instr->Ra(); break; + case 't': { + if (format[2] == '2') { + reg_num = instr->Rt2(); + field_len = 3; + } else { + reg_num = instr->Rt(); + } + break; + } + default: VIXL_UNREACHABLE(); + } + + // Increase field length for registers tagged as stack. + if (format[2] == 's') { + field_len = 3; + } + + char reg_type; + if (format[0] == 'R') { + // Register type is R: use sf bit to choose X and W. + reg_type = instr->SixtyFourBits() ? 'x' : 'w'; + } else if (format[0] == 'F') { + // Floating-point register: use type field to choose S or D. + reg_type = ((instr->FPType() & 1) == 0) ? 's' : 'd'; + } else { + // Register type is specified. Make it lower case. + reg_type = format[0] + 0x20; + } + + if ((reg_num != kZeroRegCode) || (reg_type == 's') || (reg_type == 'd')) { + // A normal register: w0 - w30, x0 - x30, s0 - s31, d0 - d31. + AppendToOutput("%c%d", reg_type, reg_num); + } else if (format[2] == 's') { + // Disassemble w31/x31 as stack pointer wsp/sp. + AppendToOutput("%s", (reg_type == 'w') ? "wsp" : "sp"); + } else { + // Disassemble w31/x31 as zero register wzr/xzr. + AppendToOutput("%czr", reg_type); + } + + return field_len; +} + + +int Disassembler::SubstituteImmediateField(Instruction* instr, + const char* format) { + VIXL_ASSERT(format[0] == 'I'); + + switch (format[1]) { + case 'M': { // IMoveImm or IMoveLSL. + if (format[5] == 'I') { + uint64_t imm = instr->ImmMoveWide() << (16 * instr->ShiftMoveWide()); + AppendToOutput("#0x%" PRIx64, imm); + } else { + VIXL_ASSERT(format[5] == 'L'); + AppendToOutput("#0x%" PRIx64, instr->ImmMoveWide()); + if (instr->ShiftMoveWide() > 0) { + AppendToOutput(", lsl #%d", 16 * instr->ShiftMoveWide()); + } + } + return 8; + } + case 'L': { + switch (format[2]) { + case 'L': { // ILLiteral - Immediate Load Literal. + AppendToOutput("pc%+" PRId64, + instr->ImmLLiteral() << kLiteralEntrySizeLog2); + return 9; + } + case 'S': { // ILS - Immediate Load/Store. + if (instr->ImmLS() != 0) { + AppendToOutput(", #%" PRId64, instr->ImmLS()); + } + return 3; + } + case 'P': { // ILPx - Immediate Load/Store Pair, x = access size. + if (instr->ImmLSPair() != 0) { + // format[3] is the scale value. Convert to a number. + int scale = format[3] - 0x30; + AppendToOutput(", #%" PRId64, instr->ImmLSPair() * scale); + } + return 4; + } + case 'U': { // ILU - Immediate Load/Store Unsigned. + if (instr->ImmLSUnsigned() != 0) { + AppendToOutput(", #%" PRIu64, + instr->ImmLSUnsigned() << instr->SizeLS()); + } + return 3; + } + } + } + case 'C': { // ICondB - Immediate Conditional Branch. + int64_t offset = instr->ImmCondBranch() << 2; + char sign = (offset >= 0) ? '+' : '-'; + AppendToOutput("#%c0x%" PRIx64, sign, offset); + return 6; + } + case 'A': { // IAddSub. + VIXL_ASSERT(instr->ShiftAddSub() <= 1); + int64_t imm = instr->ImmAddSub() << (12 * instr->ShiftAddSub()); + AppendToOutput("#0x%" PRIx64 " (%" PRId64 ")", imm, imm); + return 7; + } + case 'F': { // IFPSingle, IFPDouble or IFPFBits. + if (format[3] == 'F') { // IFPFbits. + AppendToOutput("#%d", 64 - instr->FPScale()); + return 8; + } else { + AppendToOutput("#0x%" PRIx64 " (%.4f)", instr->ImmFP(), + format[3] == 'S' ? instr->ImmFP32() : instr->ImmFP64()); + return 9; + } + } + case 'T': { // ITri - Immediate Triangular Encoded. + AppendToOutput("#0x%" PRIx64, instr->ImmLogical()); + return 4; + } + case 'N': { // INzcv. + int nzcv = (instr->Nzcv() << Flags_offset); + AppendToOutput("#%c%c%c%c", ((nzcv & NFlag) == 0) ? 'n' : 'N', + ((nzcv & ZFlag) == 0) ? 'z' : 'Z', + ((nzcv & CFlag) == 0) ? 'c' : 'C', + ((nzcv & VFlag) == 0) ? 'v' : 'V'); + return 5; + } + case 'P': { // IP - Conditional compare. + AppendToOutput("#%d", instr->ImmCondCmp()); + return 2; + } + case 'B': { // Bitfields. + return SubstituteBitfieldImmediateField(instr, format); + } + case 'E': { // IExtract. + AppendToOutput("#%d", instr->ImmS()); + return 8; + } + case 'S': { // IS - Test and branch bit. + AppendToOutput("#%d", (instr->ImmTestBranchBit5() << 5) | + instr->ImmTestBranchBit40()); + return 2; + } + case 'D': { // IDebug - HLT and BRK instructions. + AppendToOutput("#0x%x", instr->ImmException()); + return 6; + } + default: { + VIXL_UNIMPLEMENTED(); + return 0; + } + } +} + + +int Disassembler::SubstituteBitfieldImmediateField(Instruction* instr, + const char* format) { + VIXL_ASSERT((format[0] == 'I') && (format[1] == 'B')); + unsigned r = instr->ImmR(); + unsigned s = instr->ImmS(); + + switch (format[2]) { + case 'r': { // IBr. + AppendToOutput("#%d", r); + return 3; + } + case 's': { // IBs+1 or IBs-r+1. + if (format[3] == '+') { + AppendToOutput("#%d", s + 1); + return 5; + } else { + VIXL_ASSERT(format[3] == '-'); + AppendToOutput("#%d", s - r + 1); + return 7; + } + } + case 'Z': { // IBZ-r. + VIXL_ASSERT((format[3] == '-') && (format[4] == 'r')); + unsigned reg_size = (instr->SixtyFourBits() == 1) ? kXRegSize : kWRegSize; + AppendToOutput("#%d", reg_size - r); + return 5; + } + default: { + VIXL_UNREACHABLE(); + return 0; + } + } +} + + +int Disassembler::SubstituteLiteralField(Instruction* instr, + const char* format) { + VIXL_ASSERT(strncmp(format, "LValue", 6) == 0); + USE(format); + + switch (instr->Mask(LoadLiteralMask)) { + case LDR_w_lit: + case LDR_x_lit: + case LDR_s_lit: + case LDR_d_lit: AppendToOutput("(addr %p)", instr->LiteralAddress()); break; + default: VIXL_UNREACHABLE(); + } + + return 6; +} + + +int Disassembler::SubstituteShiftField(Instruction* instr, const char* format) { + VIXL_ASSERT(format[0] == 'H'); + VIXL_ASSERT(instr->ShiftDP() <= 0x3); + + switch (format[1]) { + case 'D': { // HDP. + VIXL_ASSERT(instr->ShiftDP() != ROR); + } // Fall through. + case 'L': { // HLo. + if (instr->ImmDPShift() != 0) { + const char* shift_type[] = {"lsl", "lsr", "asr", "ror"}; + AppendToOutput(", %s #%" PRId64, shift_type[instr->ShiftDP()], + instr->ImmDPShift()); + } + return 3; + } + default: + VIXL_UNIMPLEMENTED(); + return 0; + } +} + + +int Disassembler::SubstituteConditionField(Instruction* instr, + const char* format) { + VIXL_ASSERT(format[0] == 'C'); + const char* condition_code[] = { "eq", "ne", "hs", "lo", + "mi", "pl", "vs", "vc", + "hi", "ls", "ge", "lt", + "gt", "le", "al", "nv" }; + int cond; + switch (format[1]) { + case 'B': cond = instr->ConditionBranch(); break; + case 'I': { + cond = InvertCondition(static_cast(instr->Condition())); + break; + } + default: cond = instr->Condition(); + } + AppendToOutput("%s", condition_code[cond]); + return 4; +} + + +int Disassembler::SubstitutePCRelAddressField(Instruction* instr, + const char* format) { + USE(format); + VIXL_ASSERT(strncmp(format, "AddrPCRel", 9) == 0); + + int offset = instr->ImmPCRel(); + + // Only ADR (AddrPCRelByte) is supported. + VIXL_ASSERT(strcmp(format, "AddrPCRelByte") == 0); + + char sign = '+'; + if (offset < 0) { + offset = -offset; + sign = '-'; + } + VIXL_STATIC_ASSERT(sizeof(*instr) == 1); + AppendToOutput("#%c0x%x (addr %p)", sign, offset, instr + offset); + return 13; +} + + +int Disassembler::SubstituteBranchTargetField(Instruction* instr, + const char* format) { + VIXL_ASSERT(strncmp(format, "BImm", 4) == 0); + + int64_t offset = 0; + switch (format[5]) { + // BImmUncn - unconditional branch immediate. + case 'n': offset = instr->ImmUncondBranch(); break; + // BImmCond - conditional branch immediate. + case 'o': offset = instr->ImmCondBranch(); break; + // BImmCmpa - compare and branch immediate. + case 'm': offset = instr->ImmCmpBranch(); break; + // BImmTest - test and branch immediate. + case 'e': offset = instr->ImmTestBranch(); break; + default: VIXL_UNIMPLEMENTED(); + } + offset <<= kInstructionSizeLog2; + char sign = '+'; + if (offset < 0) { + offset = -offset; + sign = '-'; + } + VIXL_STATIC_ASSERT(sizeof(*instr) == 1); + AppendToOutput("#%c0x%" PRIx64 " (addr %p)", sign, offset, instr + offset); + return 8; +} + + +int Disassembler::SubstituteExtendField(Instruction* instr, + const char* format) { + VIXL_ASSERT(strncmp(format, "Ext", 3) == 0); + VIXL_ASSERT(instr->ExtendMode() <= 7); + USE(format); + + const char* extend_mode[] = { "uxtb", "uxth", "uxtw", "uxtx", + "sxtb", "sxth", "sxtw", "sxtx" }; + + // If rd or rn is SP, uxtw on 32-bit registers and uxtx on 64-bit + // registers becomes lsl. + if (((instr->Rd() == kZeroRegCode) || (instr->Rn() == kZeroRegCode)) && + (((instr->ExtendMode() == UXTW) && (instr->SixtyFourBits() == 0)) || + (instr->ExtendMode() == UXTX))) { + if (instr->ImmExtendShift() > 0) { + AppendToOutput(", lsl #%d", instr->ImmExtendShift()); + } + } else { + AppendToOutput(", %s", extend_mode[instr->ExtendMode()]); + if (instr->ImmExtendShift() > 0) { + AppendToOutput(" #%d", instr->ImmExtendShift()); + } + } + return 3; +} + + +int Disassembler::SubstituteLSRegOffsetField(Instruction* instr, + const char* format) { + VIXL_ASSERT(strncmp(format, "Offsetreg", 9) == 0); + const char* extend_mode[] = { "undefined", "undefined", "uxtw", "lsl", + "undefined", "undefined", "sxtw", "sxtx" }; + USE(format); + + unsigned shift = instr->ImmShiftLS(); + Extend ext = static_cast(instr->ExtendMode()); + char reg_type = ((ext == UXTW) || (ext == SXTW)) ? 'w' : 'x'; + + unsigned rm = instr->Rm(); + if (rm == kZeroRegCode) { + AppendToOutput("%czr", reg_type); + } else { + AppendToOutput("%c%d", reg_type, rm); + } + + // Extend mode UXTX is an alias for shift mode LSL here. + if (!((ext == UXTX) && (shift == 0))) { + AppendToOutput(", %s", extend_mode[ext]); + if (shift != 0) { + AppendToOutput(" #%d", instr->SizeLS()); + } + } + return 9; +} + + +int Disassembler::SubstitutePrefetchField(Instruction* instr, + const char* format) { + VIXL_ASSERT(format[0] == 'P'); + USE(format); + + int prefetch_mode = instr->PrefetchMode(); + + const char* ls = (prefetch_mode & 0x10) ? "st" : "ld"; + int level = (prefetch_mode >> 1) + 1; + const char* ks = (prefetch_mode & 1) ? "strm" : "keep"; + + AppendToOutput("p%sl%d%s", ls, level, ks); + return 6; +} + +int Disassembler::SubstituteBarrierField(Instruction* instr, + const char* format) { + VIXL_ASSERT(format[0] == 'M'); + USE(format); + + static const char* options[4][4] = { + { "sy (0b0000)", "oshld", "oshst", "osh" }, + { "sy (0b0100)", "nshld", "nshst", "nsh" }, + { "sy (0b1000)", "ishld", "ishst", "ish" }, + { "sy (0b1100)", "ld", "st", "sy" } + }; + int domain = instr->ImmBarrierDomain(); + int type = instr->ImmBarrierType(); + + AppendToOutput("%s", options[domain][type]); + return 1; +} + +void Disassembler::ResetOutput() { + buffer_pos_ = 0; + buffer_[buffer_pos_] = 0; +} + + +void Disassembler::AppendToOutput(const char* format, ...) { + va_list args; + va_start(args, format); + buffer_pos_ += vsnprintf(&buffer_[buffer_pos_], buffer_size_, format, args); + va_end(args); +} + + +void PrintDisassembler::ProcessOutput(Instruction* instr) { + fprintf(stream_, "0x%016" PRIx64 " %08" PRIx32 "\t\t%s\n", + reinterpret_cast(instr), + instr->InstructionBits(), + GetOutput()); +} +} // namespace vixl diff --git a/disas/libvixl/a64/disasm-a64.h b/disas/libvixl/a64/disasm-a64.h new file mode 100644 index 0000000000..3a56e15515 --- /dev/null +++ b/disas/libvixl/a64/disasm-a64.h @@ -0,0 +1,110 @@ +// Copyright 2013, ARM Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of ARM Limited nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef VIXL_A64_DISASM_A64_H +#define VIXL_A64_DISASM_A64_H + +#include "globals.h" +#include "utils.h" +#include "instructions-a64.h" +#include "decoder-a64.h" + +namespace vixl { + +class Disassembler: public DecoderVisitor { + public: + Disassembler(); + Disassembler(char* text_buffer, int buffer_size); + virtual ~Disassembler(); + char* GetOutput(); + + // Declare all Visitor functions. + #define DECLARE(A) void Visit##A(Instruction* instr); + VISITOR_LIST(DECLARE) + #undef DECLARE + + protected: + virtual void ProcessOutput(Instruction* instr); + + private: + void Format(Instruction* instr, const char* mnemonic, const char* format); + void Substitute(Instruction* instr, const char* string); + int SubstituteField(Instruction* instr, const char* format); + int SubstituteRegisterField(Instruction* instr, const char* format); + int SubstituteImmediateField(Instruction* instr, const char* format); + int SubstituteLiteralField(Instruction* instr, const char* format); + int SubstituteBitfieldImmediateField(Instruction* instr, const char* format); + int SubstituteShiftField(Instruction* instr, const char* format); + int SubstituteExtendField(Instruction* instr, const char* format); + int SubstituteConditionField(Instruction* instr, const char* format); + int SubstitutePCRelAddressField(Instruction* instr, const char* format); + int SubstituteBranchTargetField(Instruction* instr, const char* format); + int SubstituteLSRegOffsetField(Instruction* instr, const char* format); + int SubstitutePrefetchField(Instruction* instr, const char* format); + int SubstituteBarrierField(Instruction* instr, const char* format); + + inline bool RdIsZROrSP(Instruction* instr) const { + return (instr->Rd() == kZeroRegCode); + } + + inline bool RnIsZROrSP(Instruction* instr) const { + return (instr->Rn() == kZeroRegCode); + } + + inline bool RmIsZROrSP(Instruction* instr) const { + return (instr->Rm() == kZeroRegCode); + } + + inline bool RaIsZROrSP(Instruction* instr) const { + return (instr->Ra() == kZeroRegCode); + } + + bool IsMovzMovnImm(unsigned reg_size, uint64_t value); + + void ResetOutput(); + void AppendToOutput(const char* string, ...); + + char* buffer_; + uint32_t buffer_pos_; + uint32_t buffer_size_; + bool own_buffer_; +}; + + +class PrintDisassembler: public Disassembler { + public: + explicit PrintDisassembler(FILE* stream) : stream_(stream) { } + ~PrintDisassembler() { } + + protected: + virtual void ProcessOutput(Instruction* instr); + + private: + FILE *stream_; +}; +} // namespace vixl + +#endif // VIXL_A64_DISASM_A64_H diff --git a/disas/libvixl/a64/instructions-a64.cc b/disas/libvixl/a64/instructions-a64.cc new file mode 100644 index 0000000000..c4eb7c4518 --- /dev/null +++ b/disas/libvixl/a64/instructions-a64.cc @@ -0,0 +1,238 @@ +// Copyright 2013, ARM Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of ARM Limited nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "a64/instructions-a64.h" +#include "a64/assembler-a64.h" + +namespace vixl { + + +static uint64_t RotateRight(uint64_t value, + unsigned int rotate, + unsigned int width) { + VIXL_ASSERT(width <= 64); + rotate &= 63; + return ((value & ((UINT64_C(1) << rotate) - 1)) << + (width - rotate)) | (value >> rotate); +} + + +static uint64_t RepeatBitsAcrossReg(unsigned reg_size, + uint64_t value, + unsigned width) { + VIXL_ASSERT((width == 2) || (width == 4) || (width == 8) || (width == 16) || + (width == 32)); + VIXL_ASSERT((reg_size == kWRegSize) || (reg_size == kXRegSize)); + uint64_t result = value & ((UINT64_C(1) << width) - 1); + for (unsigned i = width; i < reg_size; i *= 2) { + result |= (result << i); + } + return result; +} + + +// Logical immediates can't encode zero, so a return value of zero is used to +// indicate a failure case. Specifically, where the constraints on imm_s are +// not met. +uint64_t Instruction::ImmLogical() { + unsigned reg_size = SixtyFourBits() ? kXRegSize : kWRegSize; + int64_t n = BitN(); + int64_t imm_s = ImmSetBits(); + int64_t imm_r = ImmRotate(); + + // An integer is constructed from the n, imm_s and imm_r bits according to + // the following table: + // + // N imms immr size S R + // 1 ssssss rrrrrr 64 UInt(ssssss) UInt(rrrrrr) + // 0 0sssss xrrrrr 32 UInt(sssss) UInt(rrrrr) + // 0 10ssss xxrrrr 16 UInt(ssss) UInt(rrrr) + // 0 110sss xxxrrr 8 UInt(sss) UInt(rrr) + // 0 1110ss xxxxrr 4 UInt(ss) UInt(rr) + // 0 11110s xxxxxr 2 UInt(s) UInt(r) + // (s bits must not be all set) + // + // A pattern is constructed of size bits, where the least significant S+1 + // bits are set. The pattern is rotated right by R, and repeated across a + // 32 or 64-bit value, depending on destination register width. + // + + if (n == 1) { + if (imm_s == 0x3F) { + return 0; + } + uint64_t bits = (UINT64_C(1) << (imm_s + 1)) - 1; + return RotateRight(bits, imm_r, 64); + } else { + if ((imm_s >> 1) == 0x1F) { + return 0; + } + for (int width = 0x20; width >= 0x2; width >>= 1) { + if ((imm_s & width) == 0) { + int mask = width - 1; + if ((imm_s & mask) == mask) { + return 0; + } + uint64_t bits = (UINT64_C(1) << ((imm_s & mask) + 1)) - 1; + return RepeatBitsAcrossReg(reg_size, + RotateRight(bits, imm_r & mask, width), + width); + } + } + } + VIXL_UNREACHABLE(); + return 0; +} + + +float Instruction::ImmFP32() { + // ImmFP: abcdefgh (8 bits) + // Single: aBbb.bbbc.defg.h000.0000.0000.0000.0000 (32 bits) + // where B is b ^ 1 + uint32_t bits = ImmFP(); + uint32_t bit7 = (bits >> 7) & 0x1; + uint32_t bit6 = (bits >> 6) & 0x1; + uint32_t bit5_to_0 = bits & 0x3f; + uint32_t result = (bit7 << 31) | ((32 - bit6) << 25) | (bit5_to_0 << 19); + + return rawbits_to_float(result); +} + + +double Instruction::ImmFP64() { + // ImmFP: abcdefgh (8 bits) + // Double: aBbb.bbbb.bbcd.efgh.0000.0000.0000.0000 + // 0000.0000.0000.0000.0000.0000.0000.0000 (64 bits) + // where B is b ^ 1 + uint32_t bits = ImmFP(); + uint64_t bit7 = (bits >> 7) & 0x1; + uint64_t bit6 = (bits >> 6) & 0x1; + uint64_t bit5_to_0 = bits & 0x3f; + uint64_t result = (bit7 << 63) | ((256 - bit6) << 54) | (bit5_to_0 << 48); + + return rawbits_to_double(result); +} + + +LSDataSize CalcLSPairDataSize(LoadStorePairOp op) { + switch (op) { + case STP_x: + case LDP_x: + case STP_d: + case LDP_d: return LSDoubleWord; + default: return LSWord; + } +} + + +Instruction* Instruction::ImmPCOffsetTarget() { + ptrdiff_t offset; + if (IsPCRelAddressing()) { + // PC-relative addressing. Only ADR is supported. + offset = ImmPCRel(); + } else { + // All PC-relative branches. + VIXL_ASSERT(BranchType() != UnknownBranchType); + // Relative branch offsets are instruction-size-aligned. + offset = ImmBranch() << kInstructionSizeLog2; + } + return this + offset; +} + + +inline int Instruction::ImmBranch() const { + switch (BranchType()) { + case CondBranchType: return ImmCondBranch(); + case UncondBranchType: return ImmUncondBranch(); + case CompareBranchType: return ImmCmpBranch(); + case TestBranchType: return ImmTestBranch(); + default: VIXL_UNREACHABLE(); + } + return 0; +} + + +void Instruction::SetImmPCOffsetTarget(Instruction* target) { + if (IsPCRelAddressing()) { + SetPCRelImmTarget(target); + } else { + SetBranchImmTarget(target); + } +} + + +void Instruction::SetPCRelImmTarget(Instruction* target) { + // ADRP is not supported, so 'this' must point to an ADR instruction. + VIXL_ASSERT(Mask(PCRelAddressingMask) == ADR); + + Instr imm = Assembler::ImmPCRelAddress(target - this); + + SetInstructionBits(Mask(~ImmPCRel_mask) | imm); +} + + +void Instruction::SetBranchImmTarget(Instruction* target) { + VIXL_ASSERT(((target - this) & 3) == 0); + Instr branch_imm = 0; + uint32_t imm_mask = 0; + int offset = (target - this) >> kInstructionSizeLog2; + switch (BranchType()) { + case CondBranchType: { + branch_imm = Assembler::ImmCondBranch(offset); + imm_mask = ImmCondBranch_mask; + break; + } + case UncondBranchType: { + branch_imm = Assembler::ImmUncondBranch(offset); + imm_mask = ImmUncondBranch_mask; + break; + } + case CompareBranchType: { + branch_imm = Assembler::ImmCmpBranch(offset); + imm_mask = ImmCmpBranch_mask; + break; + } + case TestBranchType: { + branch_imm = Assembler::ImmTestBranch(offset); + imm_mask = ImmTestBranch_mask; + break; + } + default: VIXL_UNREACHABLE(); + } + SetInstructionBits(Mask(~imm_mask) | branch_imm); +} + + +void Instruction::SetImmLLiteral(Instruction* source) { + VIXL_ASSERT(((source - this) & 3) == 0); + int offset = (source - this) >> kLiteralEntrySizeLog2; + Instr imm = Assembler::ImmLLiteral(offset); + Instr mask = ImmLLiteral_mask; + + SetInstructionBits(Mask(~mask) | imm); +} +} // namespace vixl + diff --git a/disas/libvixl/a64/instructions-a64.h b/disas/libvixl/a64/instructions-a64.h new file mode 100644 index 0000000000..a4240d7d33 --- /dev/null +++ b/disas/libvixl/a64/instructions-a64.h @@ -0,0 +1,360 @@ +// Copyright 2013, ARM Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of ARM Limited nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef VIXL_A64_INSTRUCTIONS_A64_H_ +#define VIXL_A64_INSTRUCTIONS_A64_H_ + +#include "globals.h" +#include "utils.h" +#include "a64/constants-a64.h" + +namespace vixl { +// ISA constants. -------------------------------------------------------------- + +typedef uint32_t Instr; +const unsigned kInstructionSize = 4; +const unsigned kInstructionSizeLog2 = 2; +const unsigned kLiteralEntrySize = 4; +const unsigned kLiteralEntrySizeLog2 = 2; +const unsigned kMaxLoadLiteralRange = 1 * MBytes; + +const unsigned kWRegSize = 32; +const unsigned kWRegSizeLog2 = 5; +const unsigned kWRegSizeInBytes = kWRegSize / 8; +const unsigned kWRegSizeInBytesLog2 = kWRegSizeLog2 - 3; +const unsigned kXRegSize = 64; +const unsigned kXRegSizeLog2 = 6; +const unsigned kXRegSizeInBytes = kXRegSize / 8; +const unsigned kXRegSizeInBytesLog2 = kXRegSizeLog2 - 3; +const unsigned kSRegSize = 32; +const unsigned kSRegSizeLog2 = 5; +const unsigned kSRegSizeInBytes = kSRegSize / 8; +const unsigned kSRegSizeInBytesLog2 = kSRegSizeLog2 - 3; +const unsigned kDRegSize = 64; +const unsigned kDRegSizeLog2 = 6; +const unsigned kDRegSizeInBytes = kDRegSize / 8; +const unsigned kDRegSizeInBytesLog2 = kDRegSizeLog2 - 3; +const uint64_t kWRegMask = UINT64_C(0xffffffff); +const uint64_t kXRegMask = UINT64_C(0xffffffffffffffff); +const uint64_t kSRegMask = UINT64_C(0xffffffff); +const uint64_t kDRegMask = UINT64_C(0xffffffffffffffff); +const uint64_t kSSignMask = UINT64_C(0x80000000); +const uint64_t kDSignMask = UINT64_C(0x8000000000000000); +const uint64_t kWSignMask = UINT64_C(0x80000000); +const uint64_t kXSignMask = UINT64_C(0x8000000000000000); +const uint64_t kByteMask = UINT64_C(0xff); +const uint64_t kHalfWordMask = UINT64_C(0xffff); +const uint64_t kWordMask = UINT64_C(0xffffffff); +const uint64_t kXMaxUInt = UINT64_C(0xffffffffffffffff); +const uint64_t kWMaxUInt = UINT64_C(0xffffffff); +const int64_t kXMaxInt = INT64_C(0x7fffffffffffffff); +const int64_t kXMinInt = INT64_C(0x8000000000000000); +const int32_t kWMaxInt = INT32_C(0x7fffffff); +const int32_t kWMinInt = INT32_C(0x80000000); +const unsigned kLinkRegCode = 30; +const unsigned kZeroRegCode = 31; +const unsigned kSPRegInternalCode = 63; +const unsigned kRegCodeMask = 0x1f; + +// AArch64 floating-point specifics. These match IEEE-754. +const unsigned kDoubleMantissaBits = 52; +const unsigned kDoubleExponentBits = 11; +const unsigned kFloatMantissaBits = 23; +const unsigned kFloatExponentBits = 8; + +const float kFP32PositiveInfinity = rawbits_to_float(0x7f800000); +const float kFP32NegativeInfinity = rawbits_to_float(0xff800000); +const double kFP64PositiveInfinity = + rawbits_to_double(UINT64_C(0x7ff0000000000000)); +const double kFP64NegativeInfinity = + rawbits_to_double(UINT64_C(0xfff0000000000000)); + +// This value is a signalling NaN as both a double and as a float (taking the +// least-significant word). +static const double kFP64SignallingNaN = + rawbits_to_double(UINT64_C(0x7ff000007f800001)); +static const float kFP32SignallingNaN = rawbits_to_float(0x7f800001); + +// A similar value, but as a quiet NaN. +static const double kFP64QuietNaN = + rawbits_to_double(UINT64_C(0x7ff800007fc00001)); +static const float kFP32QuietNaN = rawbits_to_float(0x7fc00001); + +// The default NaN values (for FPCR.DN=1). +static const double kFP64DefaultNaN = + rawbits_to_double(UINT64_C(0x7ff8000000000000)); +static const float kFP32DefaultNaN = rawbits_to_float(0x7fc00000); + + +enum LSDataSize { + LSByte = 0, + LSHalfword = 1, + LSWord = 2, + LSDoubleWord = 3 +}; + +LSDataSize CalcLSPairDataSize(LoadStorePairOp op); + +enum ImmBranchType { + UnknownBranchType = 0, + CondBranchType = 1, + UncondBranchType = 2, + CompareBranchType = 3, + TestBranchType = 4 +}; + +enum AddrMode { + Offset, + PreIndex, + PostIndex +}; + +enum FPRounding { + // The first four values are encodable directly by FPCR. + FPTieEven = 0x0, + FPPositiveInfinity = 0x1, + FPNegativeInfinity = 0x2, + FPZero = 0x3, + + // The final rounding mode is only available when explicitly specified by the + // instruction (such as with fcvta). It cannot be set in FPCR. + FPTieAway +}; + +enum Reg31Mode { + Reg31IsStackPointer, + Reg31IsZeroRegister +}; + +// Instructions. --------------------------------------------------------------- + +class Instruction { + public: + inline Instr InstructionBits() const { + return *(reinterpret_cast(this)); + } + + inline void SetInstructionBits(Instr new_instr) { + *(reinterpret_cast(this)) = new_instr; + } + + inline int Bit(int pos) const { + return (InstructionBits() >> pos) & 1; + } + + inline uint32_t Bits(int msb, int lsb) const { + return unsigned_bitextract_32(msb, lsb, InstructionBits()); + } + + inline int32_t SignedBits(int msb, int lsb) const { + int32_t bits = *(reinterpret_cast(this)); + return signed_bitextract_32(msb, lsb, bits); + } + + inline Instr Mask(uint32_t mask) const { + return InstructionBits() & mask; + } + + #define DEFINE_GETTER(Name, HighBit, LowBit, Func) \ + inline int64_t Name() const { return Func(HighBit, LowBit); } + INSTRUCTION_FIELDS_LIST(DEFINE_GETTER) + #undef DEFINE_GETTER + + // ImmPCRel is a compound field (not present in INSTRUCTION_FIELDS_LIST), + // formed from ImmPCRelLo and ImmPCRelHi. + int ImmPCRel() const { + int const offset = ((ImmPCRelHi() << ImmPCRelLo_width) | ImmPCRelLo()); + int const width = ImmPCRelLo_width + ImmPCRelHi_width; + return signed_bitextract_32(width-1, 0, offset); + } + + uint64_t ImmLogical(); + float ImmFP32(); + double ImmFP64(); + + inline LSDataSize SizeLSPair() const { + return CalcLSPairDataSize( + static_cast(Mask(LoadStorePairMask))); + } + + // Helpers. + inline bool IsCondBranchImm() const { + return Mask(ConditionalBranchFMask) == ConditionalBranchFixed; + } + + inline bool IsUncondBranchImm() const { + return Mask(UnconditionalBranchFMask) == UnconditionalBranchFixed; + } + + inline bool IsCompareBranch() const { + return Mask(CompareBranchFMask) == CompareBranchFixed; + } + + inline bool IsTestBranch() const { + return Mask(TestBranchFMask) == TestBranchFixed; + } + + inline bool IsPCRelAddressing() const { + return Mask(PCRelAddressingFMask) == PCRelAddressingFixed; + } + + inline bool IsLogicalImmediate() const { + return Mask(LogicalImmediateFMask) == LogicalImmediateFixed; + } + + inline bool IsAddSubImmediate() const { + return Mask(AddSubImmediateFMask) == AddSubImmediateFixed; + } + + inline bool IsAddSubExtended() const { + return Mask(AddSubExtendedFMask) == AddSubExtendedFixed; + } + + inline bool IsLoadOrStore() const { + return Mask(LoadStoreAnyFMask) == LoadStoreAnyFixed; + } + + inline bool IsMovn() const { + return (Mask(MoveWideImmediateMask) == MOVN_x) || + (Mask(MoveWideImmediateMask) == MOVN_w); + } + + // Indicate whether Rd can be the stack pointer or the zero register. This + // does not check that the instruction actually has an Rd field. + inline Reg31Mode RdMode() const { + // The following instructions use sp or wsp as Rd: + // Add/sub (immediate) when not setting the flags. + // Add/sub (extended) when not setting the flags. + // Logical (immediate) when not setting the flags. + // Otherwise, r31 is the zero register. + if (IsAddSubImmediate() || IsAddSubExtended()) { + if (Mask(AddSubSetFlagsBit)) { + return Reg31IsZeroRegister; + } else { + return Reg31IsStackPointer; + } + } + if (IsLogicalImmediate()) { + // Of the logical (immediate) instructions, only ANDS (and its aliases) + // can set the flags. The others can all write into sp. + // Note that some logical operations are not available to + // immediate-operand instructions, so we have to combine two masks here. + if (Mask(LogicalImmediateMask & LogicalOpMask) == ANDS) { + return Reg31IsZeroRegister; + } else { + return Reg31IsStackPointer; + } + } + return Reg31IsZeroRegister; + } + + // Indicate whether Rn can be the stack pointer or the zero register. This + // does not check that the instruction actually has an Rn field. + inline Reg31Mode RnMode() const { + // The following instructions use sp or wsp as Rn: + // All loads and stores. + // Add/sub (immediate). + // Add/sub (extended). + // Otherwise, r31 is the zero register. + if (IsLoadOrStore() || IsAddSubImmediate() || IsAddSubExtended()) { + return Reg31IsStackPointer; + } + return Reg31IsZeroRegister; + } + + inline ImmBranchType BranchType() const { + if (IsCondBranchImm()) { + return CondBranchType; + } else if (IsUncondBranchImm()) { + return UncondBranchType; + } else if (IsCompareBranch()) { + return CompareBranchType; + } else if (IsTestBranch()) { + return TestBranchType; + } else { + return UnknownBranchType; + } + } + + // Find the target of this instruction. 'this' may be a branch or a + // PC-relative addressing instruction. + Instruction* ImmPCOffsetTarget(); + + // Patch a PC-relative offset to refer to 'target'. 'this' may be a branch or + // a PC-relative addressing instruction. + void SetImmPCOffsetTarget(Instruction* target); + // Patch a literal load instruction to load from 'source'. + void SetImmLLiteral(Instruction* source); + + inline uint8_t* LiteralAddress() { + int offset = ImmLLiteral() << kLiteralEntrySizeLog2; + return reinterpret_cast(this) + offset; + } + + inline uint32_t Literal32() { + uint32_t literal; + memcpy(&literal, LiteralAddress(), sizeof(literal)); + + return literal; + } + + inline uint64_t Literal64() { + uint64_t literal; + memcpy(&literal, LiteralAddress(), sizeof(literal)); + + return literal; + } + + inline float LiteralFP32() { + return rawbits_to_float(Literal32()); + } + + inline double LiteralFP64() { + return rawbits_to_double(Literal64()); + } + + inline Instruction* NextInstruction() { + return this + kInstructionSize; + } + + inline Instruction* InstructionAtOffset(int64_t offset) { + VIXL_ASSERT(IsWordAligned(this + offset)); + return this + offset; + } + + template static inline Instruction* Cast(T src) { + return reinterpret_cast(src); + } + + private: + inline int ImmBranch() const; + + void SetPCRelImmTarget(Instruction* target); + void SetBranchImmTarget(Instruction* target); +}; +} // namespace vixl + +#endif // VIXL_A64_INSTRUCTIONS_A64_H_ diff --git a/disas/libvixl/globals.h b/disas/libvixl/globals.h new file mode 100644 index 0000000000..e28dc6663a --- /dev/null +++ b/disas/libvixl/globals.h @@ -0,0 +1,85 @@ +// Copyright 2013, ARM Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of ARM Limited nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef VIXL_GLOBALS_H +#define VIXL_GLOBALS_H + +// Get standard C99 macros for integer types. +#ifndef __STDC_CONSTANT_MACROS +#define __STDC_CONSTANT_MACROS +#endif + +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS +#endif + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include "platform.h" + + +typedef uint8_t byte; + +const int KBytes = 1024; +const int MBytes = 1024 * KBytes; + +#define VIXL_ABORT() printf("in %s, line %i", __FILE__, __LINE__); abort() +#ifdef DEBUG + #define VIXL_ASSERT(condition) assert(condition) + #define VIXL_CHECK(condition) VIXL_ASSERT(condition) + #define VIXL_UNIMPLEMENTED() printf("UNIMPLEMENTED\t"); VIXL_ABORT() + #define VIXL_UNREACHABLE() printf("UNREACHABLE\t"); VIXL_ABORT() +#else + #define VIXL_ASSERT(condition) ((void) 0) + #define VIXL_CHECK(condition) assert(condition) + #define VIXL_UNIMPLEMENTED() ((void) 0) + #define VIXL_UNREACHABLE() ((void) 0) +#endif +// This is not as powerful as template based assertions, but it is simple. +// It assumes that the descriptions are unique. If this starts being a problem, +// we can switch to a different implemention. +#define VIXL_CONCAT(a, b) a##b +#define VIXL_STATIC_ASSERT_LINE(line, condition) \ + typedef char VIXL_CONCAT(STATIC_ASSERT_LINE_, line)[(condition) ? 1 : -1] \ + __attribute__((unused)) +#define VIXL_STATIC_ASSERT(condition) VIXL_STATIC_ASSERT_LINE(__LINE__, condition) //NOLINT + +template inline void USE(T) {} + +#define VIXL_ALIGNMENT_EXCEPTION() printf("ALIGNMENT EXCEPTION\t"); VIXL_ABORT() + +#endif // VIXL_GLOBALS_H diff --git a/disas/libvixl/platform.h b/disas/libvixl/platform.h new file mode 100644 index 0000000000..b5c2085b90 --- /dev/null +++ b/disas/libvixl/platform.h @@ -0,0 +1,41 @@ +// Copyright 2013, ARM Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of ARM Limited nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef PLATFORM_H +#define PLATFORM_H + +// Define platform specific functionalities. + +namespace vixl { +#ifdef USE_SIMULATOR +// Currently we assume running the simulator implies running on x86 hardware. +inline void HostBreakpoint() { asm("int3"); } +#else +inline void HostBreakpoint() { asm("brk"); } +#endif +} // namespace vixl + +#endif diff --git a/disas/libvixl/utils.cc b/disas/libvixl/utils.cc new file mode 100644 index 0000000000..c9c05d1e18 --- /dev/null +++ b/disas/libvixl/utils.cc @@ -0,0 +1,127 @@ +// Copyright 2013, ARM Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of ARM Limited nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "utils.h" +#include + +namespace vixl { + +uint32_t float_to_rawbits(float value) { + uint32_t bits = 0; + memcpy(&bits, &value, 4); + return bits; +} + + +uint64_t double_to_rawbits(double value) { + uint64_t bits = 0; + memcpy(&bits, &value, 8); + return bits; +} + + +float rawbits_to_float(uint32_t bits) { + float value = 0.0; + memcpy(&value, &bits, 4); + return value; +} + + +double rawbits_to_double(uint64_t bits) { + double value = 0.0; + memcpy(&value, &bits, 8); + return value; +} + + +int CountLeadingZeros(uint64_t value, int width) { + VIXL_ASSERT((width == 32) || (width == 64)); + int count = 0; + uint64_t bit_test = UINT64_C(1) << (width - 1); + while ((count < width) && ((bit_test & value) == 0)) { + count++; + bit_test >>= 1; + } + return count; +} + + +int CountLeadingSignBits(int64_t value, int width) { + VIXL_ASSERT((width == 32) || (width == 64)); + if (value >= 0) { + return CountLeadingZeros(value, width) - 1; + } else { + return CountLeadingZeros(~value, width) - 1; + } +} + + +int CountTrailingZeros(uint64_t value, int width) { + VIXL_ASSERT((width == 32) || (width == 64)); + int count = 0; + while ((count < width) && (((value >> count) & 1) == 0)) { + count++; + } + return count; +} + + +int CountSetBits(uint64_t value, int width) { + // TODO: Other widths could be added here, as the implementation already + // supports them. + VIXL_ASSERT((width == 32) || (width == 64)); + + // Mask out unused bits to ensure that they are not counted. + value &= (UINT64_C(0xffffffffffffffff) >> (64-width)); + + // Add up the set bits. + // The algorithm works by adding pairs of bit fields together iteratively, + // where the size of each bit field doubles each time. + // An example for an 8-bit value: + // Bits: h g f e d c b a + // \ | \ | \ | \ | + // value = h+g f+e d+c b+a + // \ | \ | + // value = h+g+f+e d+c+b+a + // \ | + // value = h+g+f+e+d+c+b+a + const uint64_t kMasks[] = { + UINT64_C(0x5555555555555555), + UINT64_C(0x3333333333333333), + UINT64_C(0x0f0f0f0f0f0f0f0f), + UINT64_C(0x00ff00ff00ff00ff), + UINT64_C(0x0000ffff0000ffff), + UINT64_C(0x00000000ffffffff), + }; + + for (unsigned i = 0; i < (sizeof(kMasks) / sizeof(kMasks[0])); i++) { + int shift = 1 << i; + value = ((value >> shift) & kMasks[i]) + (value & kMasks[i]); + } + + return value; +} +} // namespace vixl diff --git a/disas/libvixl/utils.h b/disas/libvixl/utils.h new file mode 100644 index 0000000000..83c928c8e3 --- /dev/null +++ b/disas/libvixl/utils.h @@ -0,0 +1,190 @@ +// Copyright 2013, ARM Limited +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// * Neither the name of ARM Limited nor the names of its contributors may be +// used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND +// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#ifndef VIXL_UTILS_H +#define VIXL_UTILS_H + +#include +#include +#include "globals.h" + +namespace vixl { + +// Check number width. +inline bool is_intn(unsigned n, int64_t x) { + VIXL_ASSERT((0 < n) && (n < 64)); + int64_t limit = INT64_C(1) << (n - 1); + return (-limit <= x) && (x < limit); +} + +inline bool is_uintn(unsigned n, int64_t x) { + VIXL_ASSERT((0 < n) && (n < 64)); + return !(x >> n); +} + +inline unsigned truncate_to_intn(unsigned n, int64_t x) { + VIXL_ASSERT((0 < n) && (n < 64)); + return (x & ((INT64_C(1) << n) - 1)); +} + +#define INT_1_TO_63_LIST(V) \ +V(1) V(2) V(3) V(4) V(5) V(6) V(7) V(8) \ +V(9) V(10) V(11) V(12) V(13) V(14) V(15) V(16) \ +V(17) V(18) V(19) V(20) V(21) V(22) V(23) V(24) \ +V(25) V(26) V(27) V(28) V(29) V(30) V(31) V(32) \ +V(33) V(34) V(35) V(36) V(37) V(38) V(39) V(40) \ +V(41) V(42) V(43) V(44) V(45) V(46) V(47) V(48) \ +V(49) V(50) V(51) V(52) V(53) V(54) V(55) V(56) \ +V(57) V(58) V(59) V(60) V(61) V(62) V(63) + +#define DECLARE_IS_INT_N(N) \ +inline bool is_int##N(int64_t x) { return is_intn(N, x); } +#define DECLARE_IS_UINT_N(N) \ +inline bool is_uint##N(int64_t x) { return is_uintn(N, x); } +#define DECLARE_TRUNCATE_TO_INT_N(N) \ +inline int truncate_to_int##N(int x) { return truncate_to_intn(N, x); } +INT_1_TO_63_LIST(DECLARE_IS_INT_N) +INT_1_TO_63_LIST(DECLARE_IS_UINT_N) +INT_1_TO_63_LIST(DECLARE_TRUNCATE_TO_INT_N) +#undef DECLARE_IS_INT_N +#undef DECLARE_IS_UINT_N +#undef DECLARE_TRUNCATE_TO_INT_N + +// Bit field extraction. +inline uint32_t unsigned_bitextract_32(int msb, int lsb, uint32_t x) { + return (x >> lsb) & ((1 << (1 + msb - lsb)) - 1); +} + +inline uint64_t unsigned_bitextract_64(int msb, int lsb, uint64_t x) { + return (x >> lsb) & ((static_cast(1) << (1 + msb - lsb)) - 1); +} + +inline int32_t signed_bitextract_32(int msb, int lsb, int32_t x) { + return (x << (31 - msb)) >> (lsb + 31 - msb); +} + +inline int64_t signed_bitextract_64(int msb, int lsb, int64_t x) { + return (x << (63 - msb)) >> (lsb + 63 - msb); +} + +// Floating point representation. +uint32_t float_to_rawbits(float value); +uint64_t double_to_rawbits(double value); +float rawbits_to_float(uint32_t bits); +double rawbits_to_double(uint64_t bits); + + +// NaN tests. +inline bool IsSignallingNaN(double num) { + const uint64_t kFP64QuietNaNMask = UINT64_C(0x0008000000000000); + uint64_t raw = double_to_rawbits(num); + if (isnan(num) && ((raw & kFP64QuietNaNMask) == 0)) { + return true; + } + return false; +} + + +inline bool IsSignallingNaN(float num) { + const uint32_t kFP32QuietNaNMask = 0x00400000; + uint32_t raw = float_to_rawbits(num); + if (isnan(num) && ((raw & kFP32QuietNaNMask) == 0)) { + return true; + } + return false; +} + + +template +inline bool IsQuietNaN(T num) { + return isnan(num) && !IsSignallingNaN(num); +} + + +// Convert the NaN in 'num' to a quiet NaN. +inline double ToQuietNaN(double num) { + const uint64_t kFP64QuietNaNMask = UINT64_C(0x0008000000000000); + VIXL_ASSERT(isnan(num)); + return rawbits_to_double(double_to_rawbits(num) | kFP64QuietNaNMask); +} + + +inline float ToQuietNaN(float num) { + const uint32_t kFP32QuietNaNMask = 0x00400000; + VIXL_ASSERT(isnan(num)); + return rawbits_to_float(float_to_rawbits(num) | kFP32QuietNaNMask); +} + + +// Fused multiply-add. +inline double FusedMultiplyAdd(double op1, double op2, double a) { + return fma(op1, op2, a); +} + + +inline float FusedMultiplyAdd(float op1, float op2, float a) { + return fmaf(op1, op2, a); +} + + +// Bit counting. +int CountLeadingZeros(uint64_t value, int width); +int CountLeadingSignBits(int64_t value, int width); +int CountTrailingZeros(uint64_t value, int width); +int CountSetBits(uint64_t value, int width); + +// Pointer alignment +// TODO: rename/refactor to make it specific to instructions. +template +bool IsWordAligned(T pointer) { + VIXL_ASSERT(sizeof(pointer) == sizeof(intptr_t)); // NOLINT(runtime/sizeof) + return (reinterpret_cast(pointer) & 3) == 0; +} + +// Increment a pointer until it has the specified alignment. +template +T AlignUp(T pointer, size_t alignment) { + VIXL_STATIC_ASSERT(sizeof(pointer) == sizeof(uintptr_t)); + uintptr_t pointer_raw = reinterpret_cast(pointer); + size_t align_step = (alignment - pointer_raw) % alignment; + VIXL_ASSERT((pointer_raw + align_step) % alignment == 0); + return reinterpret_cast(pointer_raw + align_step); +} + +// Decrement a pointer until it has the specified alignment. +template +T AlignDown(T pointer, size_t alignment) { + VIXL_STATIC_ASSERT(sizeof(pointer) == sizeof(uintptr_t)); + uintptr_t pointer_raw = reinterpret_cast(pointer); + size_t align_step = pointer_raw % alignment; + VIXL_ASSERT((pointer_raw - align_step) % alignment == 0); + return reinterpret_cast(pointer_raw - align_step); +} + + +} // namespace vixl + +#endif // VIXL_UTILS_H diff --git a/disas/ppc.c b/disas/ppc.c index c149506fd8..99c4cbc3ab 100644 --- a/disas/ppc.c +++ b/disas/ppc.c @@ -5157,7 +5157,8 @@ int print_insn_ppc (bfd_vma memaddr, struct disassemble_info *info) { int dialect = (char *) info->private_data - (char *) 0; - return print_insn_powerpc (memaddr, info, 1, dialect); + return print_insn_powerpc (memaddr, info, info->endian == BFD_ENDIAN_BIG, + dialect); } /* Print a big endian PowerPC instruction. */ diff --git a/dma-helpers.c b/dma-helpers.c index 499550fc23..5f421e9814 100644 --- a/dma-helpers.c +++ b/dma-helpers.c @@ -11,6 +11,7 @@ #include "trace.h" #include "qemu/range.h" #include "qemu/thread.h" +#include "qemu/main-loop.h" /* #define DEBUG_IOMMU */ @@ -212,6 +213,7 @@ BlockDriverAIOCB *dma_bdrv_io( dbs->sg_cur_index = 0; dbs->sg_cur_byte = 0; dbs->dir = dir; + dbs->in_cancel = false; dbs->io_func = io_func; dbs->bh = NULL; qemu_iovec_init(&dbs->iov, sg->nsg); diff --git a/docs/ccid.txt b/docs/ccid.txt index 8bbaa940c3..83c174db26 100644 --- a/docs/ccid.txt +++ b/docs/ccid.txt @@ -52,7 +52,7 @@ Configuring and building: Assuming you have a working smartcard on the host with the current user, using NSS, qemu acts as another NSS client using ccid-card-emulated: - qemu -usb -device usb-ccid -device ccid-card-emualated + qemu -usb -device usb-ccid -device ccid-card-emulated 4. Using ccid-card-emulated with certificates diff --git a/docs/memory.txt b/docs/memory.txt index feb9fe90d7..5bdbdb3691 100644 --- a/docs/memory.txt +++ b/docs/memory.txt @@ -52,6 +52,15 @@ MemoryRegion): hole". Aliases may point to any type of region, including other aliases, but an alias may not point back to itself, directly or indirectly. +It is valid to add subregions to a region which is not a pure container +(that is, to an MMIO, RAM or ROM region). This means that the region +will act like a container, except that any addresses within the container's +region which are not claimed by any subregion are handled by the +container itself (ie by its MMIO callbacks or RAM backing). However +it is generally possible to achieve the same effect with a pure container +one of whose subregions is a low priority "background" region covering +the whole address range; this is often clearer and is preferred. +Subregions cannot be added to an alias region. Region names ------------ @@ -80,6 +89,53 @@ guest. This is done with memory_region_add_subregion_overlap(), which allows the region to overlap any other region in the same container, and specifies a priority that allows the core to decide which of two regions at the same address are visible (highest wins). +Priority values are signed, and the default value is zero. This means that +you can use memory_region_add_subregion_overlap() both to specify a region +that must sit 'above' any others (with a positive priority) and also a +background region that sits 'below' others (with a negative priority). + +If the higher priority region in an overlap is a container or alias, then +the lower priority region will appear in any "holes" that the higher priority +region has left by not mapping subregions to that area of its address range. +(This applies recursively -- if the subregions are themselves containers or +aliases that leave holes then the lower priority region will appear in these +holes too.) + +For example, suppose we have a container A of size 0x8000 with two subregions +B and C. B is a container mapped at 0x2000, size 0x4000, priority 1; C is +an MMIO region mapped at 0x0, size 0x6000, priority 2. B currently has two +of its own subregions: D of size 0x1000 at offset 0 and E of size 0x1000 at +offset 0x2000. As a diagram: + + 0 1000 2000 3000 4000 5000 6000 7000 8000 + |------|------|------|------|------|------|------|-------| + A: [ ] + C: [CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC] + B: [ ] + D: [DDDDD] + E: [EEEEE] + +The regions that will be seen within this address range then are: + [CCCCCCCCCCCC][DDDDD][CCCCC][EEEEE][CCCCC] + +Since B has higher priority than C, its subregions appear in the flat map +even where they overlap with C. In ranges where B has not mapped anything +C's region appears. + +If B had provided its own MMIO operations (ie it was not a pure container) +then these would be used for any addresses in its range not handled by +D or E, and the result would be: + [CCCCCCCCCCCC][DDDDD][BBBBB][EEEEE][BBBBB] + +Priority values are local to a container, because the priorities of two +regions are only compared when they are both children of the same container. +This means that the device in charge of the container (typically modelling +a bus or a memory controller) can use them to manage the interaction of +its child regions without any side effects on other parts of the system. +In the example above, the priorities of D and E are unimportant because +they do not overlap each other. It is the relative priority of B and C +that causes D and E to appear on top of C: D and E's priorities are never +compared against the priority of C. Visibility ---------- @@ -90,11 +146,19 @@ guest accesses an address: descending priority order - if the address lies outside the region offset/size, the subregion is discarded - - if the subregion is a leaf (RAM or MMIO), the search terminates + - if the subregion is a leaf (RAM or MMIO), the search terminates, returning + this leaf region - if the subregion is a container, the same algorithm is used within the subregion (after the address is adjusted by the subregion offset) - - if the subregion is an alias, the search is continues at the alias target + - if the subregion is an alias, the search is continued at the alias target (after the address is adjusted by the subregion offset and alias offset) + - if a recursive search within a container or alias subregion does not + find a match (because of a "hole" in the container's coverage of its + address range), then if this is a container with its own MMIO or RAM + backing the search terminates, returning the container itself. Otherwise + we continue with the next subregion in priority order +- if none of the subregions match the address then the search terminates + with no match found Example memory map ------------------ @@ -168,8 +232,8 @@ various constraints can be supplied to control how these callbacks are called: (in bytes) supported by the *implementation*; other access sizes will be emulated using the ones available. For example a 4-byte write will be emulated using four 1-byte writes, if .impl.max_access_size = 1. - - .impl.valid specifies that the *implementation* only supports unaligned - accesses; unaligned accesses will be emulated by two aligned accesses. - - .old_portio and .old_mmio can be used to ease porting from code using - cpu_register_io_memory() and register_ioport(). They should not be used - in new code. + - .impl.unaligned specifies that the *implementation* supports unaligned + accesses; if false, unaligned accesses will be emulated by two aligned + accesses. + - .old_mmio can be used to ease porting from code using + cpu_register_io_memory(). It should not be used in new code. diff --git a/docs/migration.txt b/docs/migration.txt index 0e0a1d44da..0492a4547a 100644 --- a/docs/migration.txt +++ b/docs/migration.txt @@ -139,8 +139,7 @@ static const VMStateDescription vmstate_kbd = { .name = "pckbd", .version_id = 3, .minimum_version_id = 3, - .minimum_version_id_old = 3, - .fields = (VMStateField []) { + .fields = (VMStateField[]) { VMSTATE_UINT8(write_cmd, KBDState), VMSTATE_UINT8(status, KBDState), VMSTATE_UINT8(mode, KBDState), @@ -168,12 +167,13 @@ You can see that there are several version fields: - minimum_version_id: the minimum version_id that VMState is able to understand for that device. - minimum_version_id_old: For devices that were not able to port to vmstate, we can - assign a function that knows how to read this old state. + assign a function that knows how to read this old state. This field is + ignored if there is no load_state_old handler. So, VMState is able to read versions from minimum_version_id to -version_id. And the function load_state_old() is able to load state -from minimum_version_id_old to minimum_version_id. This function is -deprecated and will be removed when no more users are left. +version_id. And the function load_state_old() (if present) is able to +load state from minimum_version_id_old to minimum_version_id. This +function is deprecated and will be removed when no more users are left. === Massaging functions === @@ -255,10 +255,9 @@ const VMStateDescription vmstate_ide_drive_pio_state = { .name = "ide_drive/pio_state", .version_id = 1, .minimum_version_id = 1, - .minimum_version_id_old = 1, .pre_save = ide_drive_pio_pre_save, .post_load = ide_drive_pio_post_load, - .fields = (VMStateField []) { + .fields = (VMStateField[]) { VMSTATE_INT32(req_nb_sectors, IDEState), VMSTATE_VARRAY_INT32(io_buffer, IDEState, io_buffer_total_len, 1, vmstate_info_uint8, uint8_t), @@ -275,9 +274,8 @@ const VMStateDescription vmstate_ide_drive = { .name = "ide_drive", .version_id = 3, .minimum_version_id = 0, - .minimum_version_id_old = 0, .post_load = ide_drive_post_load, - .fields = (VMStateField []) { + .fields = (VMStateField[]) { .... several fields .... VMSTATE_END_OF_LIST() }, diff --git a/docs/q35-chipset.cfg b/docs/q35-chipset.cfg index 1b6efc0f2c..e4ddb7d9cc 100644 --- a/docs/q35-chipset.cfg +++ b/docs/q35-chipset.cfg @@ -91,6 +91,29 @@ port = "4" chassis = "4" +## +# Example PCIe switch with two downstream ports +# +#[device "pcie-switch-upstream-port-1"] +# driver = "x3130-upstream" +# bus = "ich9-pcie-port-4" +# addr = "00.0" +# +#[device "pcie-switch-downstream-port-1-1"] +# driver = "xio3130-downstream" +# multifunction = "on" +# bus = "pcie-switch-upstream-port-1" +# addr = "00.0" +# port = "1" +# chassis = "5" +# +#[device "pcie-switch-downstream-port-1-2"] +# driver = "xio3130-downstream" +# multifunction = "on" +# bus = "pcie-switch-upstream-port-1" +# addr = "00.1" +# port = "1" +# chassis = "6" [device "ich9-ehci-1"] driver = "ich9-usb-ehci1" diff --git a/docs/qapi-code-gen.txt b/docs/qapi-code-gen.txt index 0ce045c0b3..26312d84e8 100644 --- a/docs/qapi-code-gen.txt +++ b/docs/qapi-code-gen.txt @@ -40,6 +40,17 @@ enumeration types and union types. Generally speaking, types definitions should always use CamelCase for the type names. Command names should be all lower case with words separated by a hyphen. + +=== Includes === + +The QAPI schema definitions can be modularized using the 'include' directive: + + { 'include': 'path/to/file.json'} + +The directive is evaluated recursively, and include paths are relative to the +file using the directive. + + === Complex types === A complex type is a dictionary containing a single key whose value is a @@ -49,9 +60,50 @@ example of a complex type is: { 'type': 'MyType', 'data': { 'member1': 'str', 'member2': 'int', '*member3': 'str' } } -The use of '*' as a prefix to the name means the member is optional. Optional -members should always be added to the end of the dictionary to preserve -backwards compatibility. +The use of '*' as a prefix to the name means the member is optional. + +The default initialization value of an optional argument should not be changed +between versions of QEMU unless the new default maintains backward +compatibility to the user-visible behavior of the old default. + +With proper documentation, this policy still allows some flexibility; for +example, documenting that a default of 0 picks an optimal buffer size allows +one release to declare the optimal size at 512 while another release declares +the optimal size at 4096 - the user-visible behavior is not the bytes used by +the buffer, but the fact that the buffer was optimal size. + +On input structures (only mentioned in the 'data' side of a command), changing +from mandatory to optional is safe (older clients will supply the option, and +newer clients can benefit from the default); changing from optional to +mandatory is backwards incompatible (older clients may be omitting the option, +and must continue to work). + +On output structures (only mentioned in the 'returns' side of a command), +changing from mandatory to optional is in general unsafe (older clients may be +expecting the field, and could crash if it is missing), although it can be done +if the only way that the optional argument will be omitted is when it is +triggered by the presence of a new input flag to the command that older clients +don't know to send. Changing from optional to mandatory is safe. + +A structure that is used in both input and output of various commands +must consider the backwards compatibility constraints of both directions +of use. + +A complex type definition can specify another complex type as its base. +In this case, the fields of the base type are included as top-level fields +of the new complex type's dictionary in the QMP wire format. An example +definition is: + + { 'type': 'BlockdevOptionsGenericFormat', 'data': { 'file': 'str' } } + { 'type': 'BlockdevOptionsGenericCOWFormat', + 'base': 'BlockdevOptionsGenericFormat', + 'data': { '*backing': 'str' } } + +An example BlockdevOptionsGenericCOWFormat object on the wire could use +both fields like this: + + { "file": "/some/place/my-image", + "backing": "/some/place/my-backing-file" } === Enumeration types === @@ -106,11 +158,12 @@ And it looks like this on the wire: Flat union types avoid the nesting on the wire. They are used whenever a specific field of the base type is declared as the discriminator ('type' is -then no longer generated). The discriminator must always be a string field. +then no longer generated). The discriminator must be of enumeration type. The above example can then be modified as follows: + { 'enum': 'BlockdevDriver', 'data': [ 'raw', 'qcow2' ] } { 'type': 'BlockdevCommonOptions', - 'data': { 'driver': 'str', 'readonly': 'bool' } } + 'data': { 'driver': 'BlockdevDriver', 'readonly': 'bool' } } { 'union': 'BlockdevOptions', 'base': 'BlockdevCommonOptions', 'discriminator': 'driver', @@ -147,7 +200,7 @@ This example allows using both of the following example objects: { "file": "my_existing_block_device_id" } { "file": { "driver": "file", "readonly": false, - 'filename': "/tmp/mydisk.qcow2" } } + "filename": "/tmp/mydisk.qcow2" } } === Commands === @@ -203,7 +256,7 @@ created code. Example: mdroth@illuin:~/w/qemu2.git$ python scripts/qapi-types.py \ - --output-dir="qapi-generated" --prefix="example-" < example-schema.json + --output-dir="qapi-generated" --prefix="example-" --input-file=example-schema.json mdroth@illuin:~/w/qemu2.git$ cat qapi-generated/example-qapi-types.c /* AUTOMATICALLY GENERATED, DO NOT MODIFY */ @@ -273,7 +326,7 @@ $(prefix)qapi-visit.h: declarations for previously mentioned visitor Example: mdroth@illuin:~/w/qemu2.git$ python scripts/qapi-visit.py \ - --output-dir="qapi-generated" --prefix="example-" < example-schema.json + --output-dir="qapi-generated" --prefix="example-" --input-file=example-schema.json mdroth@illuin:~/w/qemu2.git$ cat qapi-generated/example-qapi-visit.c /* THIS FILE IS AUTOMATICALLY GENERATED, DO NOT MODIFY */ diff --git a/docs/qemupciserial.inf b/docs/qemupciserial.inf index 3474310a4b..6f7eef49cc 100644 --- a/docs/qemupciserial.inf +++ b/docs/qemupciserial.inf @@ -11,99 +11,92 @@ ; (Com+Lpt)" from the list. Click "Have a disk". Select this file. ; Procedure may vary a bit depending on the windows version. -; FIXME: This file covers the single port version only. +; This file covers all options: pci-serial, pci-serial-2x, pci-serial-4x +; for both 32 and 64 bit platforms. [Version] -Signature="$CHICAGO$" -Class=Ports -ClassGuid={4D36E978-E325-11CE-BFC1-08002BE10318} +Signature="$Windows NT$" +Class=MultiFunction +ClassGUID={4d36e971-e325-11ce-bfc1-08002be10318} Provider=%QEMU% -DriverVer=09/24/2012,1.3.0 - -[SourceDisksNames] -3426=windows cd - -[SourceDisksFiles] -serial.sys = 3426 -serenum.sys = 3426 - -[DestinationDirs] -DefaultDestDir = 11 ;LDID_SYS -ComPort.NT.Copy = 12 ;DIRID_DRIVERS -SerialEnumerator.NT.Copy=12 ;DIRID_DRIVERS - -; Drivers -;---------------------------------------------------------- +DriverVer=12/29/2013,1.3.0 +[ControlFlags] +ExcludeFromSelect=* [Manufacturer] -%QEMU%=QEMU,NTx86 +%QEMU%=QEMU,NTx86,NTAMD64 [QEMU.NTx86] -%QEMU-PCI_SERIAL.DeviceDesc% = ComPort, "PCI\VEN_1b36&DEV_0002&CC_0700" - -; COM sections -;---------------------------------------------------------- -[ComPort.AddReg] -HKR,,PortSubClass,1,01 - -[ComPort.NT] -AddReg=ComPort.AddReg, ComPort.NT.AddReg -LogConfig=caa -SyssetupPnPFlags = 1 - -[ComPort.NT.HW] -AddReg=ComPort.NT.HW.AddReg - -[ComPort.NT.AddReg] -HKR,,EnumPropPages32,,"MsPorts.dll,SerialPortPropPageProvider" - -[ComPort.NT.HW.AddReg] -HKR,,"UpperFilters",0x00010000,"serenum" - -;-------------- Service installation -; Port Driver (function driver for this device) -[ComPort.NT.Services] -AddService = Serial, 0x00000002, Serial_Service_Inst, Serial_EventLog_Inst -AddService = Serenum,,Serenum_Service_Inst - -; -------------- Serial Port Driver install sections -[Serial_Service_Inst] -DisplayName = %Serial.SVCDESC% -ServiceType = 1 ; SERVICE_KERNEL_DRIVER -StartType = 1 ; SERVICE_SYSTEM_START (this driver may do detection) -ErrorControl = 0 ; SERVICE_ERROR_IGNORE -ServiceBinary = %12%\serial.sys -LoadOrderGroup = Extended base - -; -------------- Serenum Driver install section -[Serenum_Service_Inst] -DisplayName = %Serenum.SVCDESC% -ServiceType = 1 ; SERVICE_KERNEL_DRIVER -StartType = 3 ; SERVICE_DEMAND_START -ErrorControl = 1 ; SERVICE_ERROR_NORMAL -ServiceBinary = %12%\serenum.sys -LoadOrderGroup = PNP Filter - -[Serial_EventLog_Inst] -AddReg = Serial_EventLog_AddReg - -[Serial_EventLog_AddReg] -HKR,,EventMessageFile,0x00020000,"%%SystemRoot%%\System32\IoLogMsg.dll;%%SystemRoot%%\System32\drivers\serial.sys" -HKR,,TypesSupported,0x00010001,7 - -; The following sections are COM port resource configs. -; Section name format means: -; Char 1 = c (COM port) -; Char 2 = I/O config: 1 (3f8), 2 (2f8), 3 (3e8), 4 (2e8), a (any) -; Char 3 = IRQ config: #, a (any) - -[caa] ; Any base, any IRQ -ConfigPriority=HARDRECONFIG -IOConfig=8@100-ffff%fff8(3ff::) -IRQConfig=S:3,4,5,7,9,10,11,12,14,15 +%QEMU-PCI_SERIAL_1_PORT%=ComPort_inst1, PCI\VEN_1B36&DEV_0002 +%QEMU-PCI_SERIAL_2_PORT%=ComPort_inst2, PCI\VEN_1B36&DEV_0003 +%QEMU-PCI_SERIAL_4_PORT%=ComPort_inst4, PCI\VEN_1B36&DEV_0004 + +[QEMU.NTAMD64] +%QEMU-PCI_SERIAL_1_PORT%=ComPort_inst1, PCI\VEN_1B36&DEV_0002 +%QEMU-PCI_SERIAL_2_PORT%=ComPort_inst2, PCI\VEN_1B36&DEV_0003 +%QEMU-PCI_SERIAL_4_PORT%=ComPort_inst4, PCI\VEN_1B36&DEV_0004 + +[ComPort_inst1] +Include=mf.inf +Needs=MFINSTALL.mf + +[ComPort_inst2] +Include=mf.inf +Needs=MFINSTALL.mf + +[ComPort_inst4] +Include=mf.inf +Needs=MFINSTALL.mf + +[ComPort_inst1.HW] +AddReg=ComPort_inst1.RegHW + +[ComPort_inst2.HW] +AddReg=ComPort_inst2.RegHW + +[ComPort_inst4.HW] +AddReg=ComPort_inst4.RegHW + +[ComPort_inst1.Services] +Include=mf.inf +Needs=MFINSTALL.mf.Services + +[ComPort_inst2.Services] +Include=mf.inf +Needs=MFINSTALL.mf.Services + +[ComPort_inst4.Services] +Include=mf.inf +Needs=MFINSTALL.mf.Services + +[ComPort_inst1.RegHW] +HKR,Child0000,HardwareID,,*PNP0501 +HKR,Child0000,VaryingResourceMap,1,00, 00,00,00,00, 08,00,00,00 +HKR,Child0000,ResourceMap,1,02 + +[ComPort_inst2.RegHW] +HKR,Child0000,HardwareID,,*PNP0501 +HKR,Child0000,VaryingResourceMap,1,00, 00,00,00,00, 08,00,00,00 +HKR,Child0000,ResourceMap,1,02 +HKR,Child0001,HardwareID,,*PNP0501 +HKR,Child0001,VaryingResourceMap,1,00, 08,00,00,00, 08,00,00,00 +HKR,Child0001,ResourceMap,1,02 + +[ComPort_inst4.RegHW] +HKR,Child0000,HardwareID,,*PNP0501 +HKR,Child0000,VaryingResourceMap,1,00, 00,00,00,00, 08,00,00,00 +HKR,Child0000,ResourceMap,1,02 +HKR,Child0001,HardwareID,,*PNP0501 +HKR,Child0001,VaryingResourceMap,1,00, 08,00,00,00, 08,00,00,00 +HKR,Child0001,ResourceMap,1,02 +HKR,Child0002,HardwareID,,*PNP0501 +HKR,Child0002,VaryingResourceMap,1,00, 10,00,00,00, 08,00,00,00 +HKR,Child0002,ResourceMap,1,02 +HKR,Child0003,HardwareID,,*PNP0501 +HKR,Child0003,VaryingResourceMap,1,00, 18,00,00,00, 08,00,00,00 +HKR,Child0003,ResourceMap,1,02 [Strings] QEMU="QEMU" -QEMU-PCI_SERIAL.DeviceDesc="QEMU Serial PCI Card" - -Serial.SVCDESC = "Serial port driver" -Serenum.SVCDESC = "Serenum Filter Driver" +QEMU-PCI_SERIAL_1_PORT="1x QEMU PCI Serial Card" +QEMU-PCI_SERIAL_2_PORT="2x QEMU PCI Serial Card" +QEMU-PCI_SERIAL_4_PORT="4x QEMU PCI Serial Card" diff --git a/docs/qmp/README b/docs/qmp/README new file mode 100644 index 0000000000..f6a3a031e9 --- /dev/null +++ b/docs/qmp/README @@ -0,0 +1,87 @@ + QEMU Machine Protocol + ===================== + +Introduction +------------ + +The QEMU Machine Protocol (QMP) allows applications to operate a +QEMU instance. + +QMP is JSON[1] based and features the following: + +- Lightweight, text-based, easy to parse data format +- Asynchronous messages support (ie. events) +- Capabilities Negotiation + +For detailed information on QMP's usage, please, refer to the following files: + +o qmp-spec.txt QEMU Machine Protocol current specification +o qmp-commands.txt QMP supported commands (auto-generated at build-time) +o qmp-events.txt List of available asynchronous events + +[1] http://www.json.org + +Usage +----- + +You can use the -qmp option to enable QMP. For example, the following +makes QMP available on localhost port 4444: + +$ qemu [...] -qmp tcp:localhost:4444,server,nowait + +However, for more flexibility and to make use of more options, the -mon +command-line option should be used. For instance, the following example +creates one HMP instance (human monitor) on stdio and one QMP instance +on localhost port 4444: + +$ qemu [...] -chardev stdio,id=mon0 -mon chardev=mon0,mode=readline \ + -chardev socket,id=mon1,host=localhost,port=4444,server,nowait \ + -mon chardev=mon1,mode=control,pretty=on + +Please, refer to QEMU's manpage for more information. + +Simple Testing +-------------- + +To manually test QMP one can connect with telnet and issue commands by hand: + +$ telnet localhost 4444 +Trying 127.0.0.1... +Connected to localhost. +Escape character is '^]'. +{ + "QMP": { + "version": { + "qemu": { + "micro": 50, + "minor": 6, + "major": 1 + }, + "package": "" + }, + "capabilities": [ + ] + } +} + +{ "execute": "qmp_capabilities" } +{ + "return": { + } +} + +{ "execute": "query-status" } +{ + "return": { + "status": "prelaunch", + "singlestep": false, + "running": false + } +} + +Please, refer to the qapi-schema.json file for a complete command reference. + +QMP wiki page +------------- + +http://wiki.qemu-project.org/QMP diff --git a/QMP/qmp-events.txt b/docs/qmp/qmp-events.txt similarity index 86% rename from QMP/qmp-events.txt rename to docs/qmp/qmp-events.txt index 39b6016460..145402e078 100644 --- a/QMP/qmp-events.txt +++ b/docs/qmp/qmp-events.txt @@ -1,4 +1,4 @@ - QEMU Monitor Protocol Events + QEMU Machine Protocol Events ============================ BALLOON_CHANGE @@ -18,6 +18,28 @@ Example: "data": { "actual": 944766976 }, "timestamp": { "seconds": 1267020223, "microseconds": 435656 } } +BLOCK_IMAGE_CORRUPTED +--------------------- + +Emitted when a disk image is being marked corrupt. + +Data: + +- "device": Device name (json-string) +- "msg": Informative message (e.g., reason for the corruption) (json-string) +- "offset": If the corruption resulted from an image access, this is the access + offset into the image (json-int) +- "size": If the corruption resulted from an image access, this is the access + size (json-int) + +Example: + +{ "event": "BLOCK_IMAGE_CORRUPTED", + "data": { "device": "ide0-hd0", + "msg": "Prevented active L1 table overwrite", "offset": 196608, + "size": 65536 }, + "timestamp": { "seconds": 1378126126, "microseconds": 966463 } } + BLOCK_IO_ERROR -------------- @@ -137,7 +159,7 @@ Note: The "ready to complete" status is always reset by a BLOCK_JOB_ERROR event. DEVICE_DELETED ------------------ +-------------- Emitted whenever the device removal completion is acknowledged by the guest. @@ -172,8 +194,22 @@ Data: }, "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } +GUEST_PANICKED +-------------- + +Emitted when guest OS panic is detected. + +Data: + +- "action": Action that has been taken (json-string, currently always "pause"). + +Example: + +{ "event": "GUEST_PANICKED", + "data": { "action": "pause" } } + NIC_RX_FILTER_CHANGED ------------------ +--------------------- The event is emitted once until the query command is executed, the first event will always be emitted. @@ -189,6 +225,45 @@ Data: "timestamp": { "seconds": 1368697518, "microseconds": 326866 } } } +QUORUM_FAILURE +-------------- + +Emitted by the Quorum block driver if it fails to establish a quorum. + +Data: + +- "reference": device name if defined else node name. +- "sector-num": Number of the first sector of the failed read operation. +- "sector-count": Failed read operation sector count. + +Example: + +{ "event": "QUORUM_FAILURE", + "data": { "reference": "usr1", "sector-num": 345435, "sector-count": 5 }, + "timestamp": { "seconds": 1344522075, "microseconds": 745528 } } + +QUORUM_REPORT_BAD +----------------- + +Emitted to report a corruption of a Quorum file. + +Data: + +- "error": Error message (json-string, optional) + Only present on failure. This field contains a human-readable + error message. There are no semantics other than that the + block layer reported an error and clients should not try to + interpret the error string. +- "node-name": The graph node name of the block driver state. +- "sector-num": Number of the first sector of the failed read operation. +- "sector-count": Failed read operation sector count. + +Example: + +{ "event": "QUORUM_REPORT_BAD", + "data": { "node-name": "1.raw", "sector-num": 345435, "sector-count": 5 }, + "timestamp": { "seconds": 1344522075, "microseconds": 745528 } } + RESET ----- @@ -443,7 +518,7 @@ Data: None. Example: -{ "event": "WATCHDOG", +{ "event": "WAKEUP", "timestamp": { "seconds": 1344522075, "microseconds": 745528 } } WATCHDOG @@ -464,17 +539,3 @@ Example: Note: If action is "reset", "shutdown", or "pause" the WATCHDOG event is followed respectively by the RESET, SHUTDOWN, or STOP events. - -GUEST_PANICKED --------------- - -Emitted when guest OS panic is detected. - -Data: - -- "action": Action that has been taken (json-string, currently always "pause"). - -Example: - -{ "event": "GUEST_PANICKED", - "data": { "action": "pause" } } diff --git a/QMP/qmp-spec.txt b/docs/qmp/qmp-spec.txt similarity index 82% rename from QMP/qmp-spec.txt rename to docs/qmp/qmp-spec.txt index a27789692b..22568c644e 100644 --- a/QMP/qmp-spec.txt +++ b/docs/qmp/qmp-spec.txt @@ -1,21 +1,17 @@ - QEMU Monitor Protocol Specification - Version 0.1 + QEMU Machine Protocol Specification 1. Introduction =============== -This document specifies the QEMU Monitor Protocol (QMP), a JSON-based protocol -which is available for applications to control QEMU at the machine-level. - -To enable QMP support, QEMU has to be run in "control mode". This is done by -starting QEMU with the appropriate command-line options. Please, refer to the -QEMU manual page for more information. +This document specifies the QEMU Machine Protocol (QMP), a JSON-based protocol +which is available for applications to operate QEMU at the machine-level. 2. Protocol Specification ========================= This section details the protocol format. For the purpose of this document -"Client" is any application which is communicating with QEMU in control mode, -and "Server" is QEMU itself. +"Client" is any application which is using QMP to communicate with QEMU and +"Server" is QEMU itself. JSON data structures, when mentioned in this document, are always in the following format: @@ -47,14 +43,14 @@ that the connection has been successfully established and that the Server is ready for capabilities negotiation (for more information refer to section '4. Capabilities Negotiation'). -The format is: +The greeting message format is: { "QMP": { "version": json-object, "capabilities": json-array } } Where, - The "version" member contains the Server's version information (the format - is the same of the 'query-version' command) + is the same of the query-version command) - The "capabilities" member specify the availability of features beyond the baseline specification @@ -83,10 +79,7 @@ of a command execution: success or error. 2.4.1 success ------------- -The success response is issued when the command execution has finished -without errors. - -The format is: +The format of a success response is: { "return": json-object, "id": json-value } @@ -96,15 +89,12 @@ The format is: in a per-command basis or an empty json-object if the command does not return data - The "id" member contains the transaction identification associated - with the command execution (if issued by the Client) + with the command execution if issued by the Client 2.4.2 error ----------- -The error response is issued when the command execution could not be -completed because of an error condition. - -The format is: +The format of an error response is: { "error": { "class": json-string, "desc": json-string }, "id": json-value } @@ -114,7 +104,7 @@ The format is: - The "desc" member is a human-readable error message. Clients should not attempt to parse this message. - The "id" member contains the transaction identification associated with - the command execution (if issued by the Client) + the command execution if issued by the Client NOTE: Some errors can occur before the Server is able to read the "id" member, in these cases the "id" member will not be part of the error response, even @@ -124,9 +114,9 @@ if provided by the client. ----------------------- As a result of state changes, the Server may send messages unilaterally -to the Client at any time. They are called 'asynchronous events'. +to the Client at any time. They are called "asynchronous events". -The format is: +The format of asynchronous events is: { "event": json-string, "data": json-object, "timestamp": { "seconds": json-number, "microseconds": json-number } } @@ -147,36 +137,37 @@ qmp-events.txt file. =============== This section provides some examples of real QMP usage, in all of them -'C' stands for 'Client' and 'S' stands for 'Server'. +"C" stands for "Client" and "S" stands for "Server". 3.1 Server greeting ------------------- -S: {"QMP": {"version": {"qemu": "0.12.50", "package": ""}, "capabilities": []}} +S: { "QMP": { "version": { "qemu": { "micro": 50, "minor": 6, "major": 1 }, + "package": ""}, "capabilities": []}} 3.2 Simple 'stop' execution --------------------------- C: { "execute": "stop" } -S: {"return": {}} +S: { "return": {} } 3.3 KVM information ------------------- C: { "execute": "query-kvm", "id": "example" } -S: {"return": {"enabled": true, "present": true}, "id": "example"} +S: { "return": { "enabled": true, "present": true }, "id": "example"} 3.4 Parsing error ------------------ C: { "execute": } -S: {"error": {"class": "GenericError", "desc": "Invalid JSON syntax" } } +S: { "error": { "class": "GenericError", "desc": "Invalid JSON syntax" } } 3.5 Powerdown event ------------------- -S: {"timestamp": {"seconds": 1258551470, "microseconds": 802384}, "event": -"POWERDOWN"} +S: { "timestamp": { "seconds": 1258551470, "microseconds": 802384 }, + "event": "POWERDOWN" } 4. Capabilities Negotiation ---------------------------- @@ -184,17 +175,17 @@ S: {"timestamp": {"seconds": 1258551470, "microseconds": 802384}, "event": When a Client successfully establishes a connection, the Server is in Capabilities Negotiation mode. -In this mode only the 'qmp_capabilities' command is allowed to run, all -other commands will return the CommandNotFound error. Asynchronous messages -are not delivered either. +In this mode only the qmp_capabilities command is allowed to run, all +other commands will return the CommandNotFound error. Asynchronous +messages are not delivered either. -Clients should use the 'qmp_capabilities' command to enable capabilities +Clients should use the qmp_capabilities command to enable capabilities advertised in the Server's greeting (section '2.2 Server Greeting') they support. -When the 'qmp_capabilities' command is issued, and if it does not return an +When the qmp_capabilities command is issued, and if it does not return an error, the Server enters in Command mode where capabilities changes take -effect, all commands (except 'qmp_capabilities') are allowed and asynchronous +effect, all commands (except qmp_capabilities) are allowed and asynchronous messages are delivered. 5 Compatibility Considerations @@ -245,7 +236,7 @@ arguments, errors, asynchronous events, and so forth. Any new names downstream wishes to add must begin with '__'. To ensure compatibility with other downstreams, it is strongly -recommended that you prefix your downstram names with '__RFQDN_' where +recommended that you prefix your downstream names with '__RFQDN_' where RFQDN is a valid, reverse fully qualified domain name which you control. For example, a qemu-kvm specific monitor command would be: diff --git a/docs/rdma.txt b/docs/rdma.txt index 8d1e003f92..1f5d9e9fe4 100644 --- a/docs/rdma.txt +++ b/docs/rdma.txt @@ -1,7 +1,7 @@ (RDMA: Remote Direct Memory Access) RDMA Live Migration Specification, Version # 1 ============================================== -Wiki: http://wiki.qemu.org/Features/RDMALiveMigration +Wiki: http://wiki.qemu-project.org/Features/RDMALiveMigration Github: git@github.com:hinesmr/qemu.git, 'rdma' branch Copyright (C) 2013 Michael R. Hines @@ -66,7 +66,7 @@ bulk-phase round of the migration and can be enabled for extremely high-performance RDMA hardware using the following command: QEMU Monitor Command: -$ migrate_set_capability x-rdma-pin-all on # disabled by default +$ migrate_set_capability rdma-pin-all on # disabled by default Performing this action will cause all 8GB to be pinned, so if that's not what you want, then please ignore this step altogether. @@ -93,12 +93,12 @@ $ migrate_set_speed 40g # or whatever is the MAX of your RDMA device Next, on the destination machine, add the following to the QEMU command line: -qemu ..... -incoming x-rdma:host:port +qemu ..... -incoming rdma:host:port Finally, perform the actual migration on the source machine: QEMU Monitor Command: -$ migrate -d x-rdma:host:port +$ migrate -d rdma:host:port PERFORMANCE =========== @@ -120,8 +120,8 @@ For example, in the same 8GB RAM example with all 8GB of memory in active use and the VM itself is completely idle using the same 40 gbps infiniband link: -1. x-rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps -2. x-rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps +1. rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps +2. rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps These numbers would of course scale up to whatever size virtual machine you have to migrate using RDMA. @@ -407,18 +407,14 @@ socket is broken during a non-RDMA based migration. TODO: ===== -1. 'migrate x-rdma:host:port' and '-incoming x-rdma' options will be - renamed to 'rdma' after the experimental phase of this work has - completed upstream. -2. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits +1. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits are not compatible with infinband memory pinning and will result in an aborted migration (but with the source VM left unaffected). -3. Use of the recent /proc//pagemap would likely speed up +2. Use of the recent /proc//pagemap would likely speed up the use of KSM and ballooning while using RDMA. -4. Also, some form of balloon-device usage tracking would also +3. Also, some form of balloon-device usage tracking would also help alleviate some issues. -5. Move UNREGISTER requests to a separate thread. -6. Use LRU to provide more fine-grained direction of UNREGISTER +4. Use LRU to provide more fine-grained direction of UNREGISTER requests for unpinning memory in an overcommitted environment. -7. Expose UNREGISTER support to the user by way of workload-specific +5. Expose UNREGISTER support to the user by way of workload-specific hints about application behavior. diff --git a/docs/specs/acpi_cpu_hotplug.txt b/docs/specs/acpi_cpu_hotplug.txt index 5dec0c5010..340b751a95 100644 --- a/docs/specs/acpi_cpu_hotplug.txt +++ b/docs/specs/acpi_cpu_hotplug.txt @@ -10,7 +10,9 @@ ACPI GPE block (IO ports 0xafe0-0xafe3, byte access): Generic ACPI GPE block. Bit 2 (GPE.2) used to notify CPU hot-add/remove event to ACPI BIOS, via SCI interrupt. -CPU present bitmap (IO port 0xaf00-0xae1f, 1-byte access): +CPU present bitmap for: + ICH9-LPC (IO port 0x0cd8-0xcf7, 1-byte access) + PIIX-PM (IO port 0xaf00-0xaf1f, 1-byte access) --------------------------------------------------------------- One bit per CPU. Bit position reflects corresponding CPU APIC ID. Read-only. diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt index 36a559d886..f19536a46f 100644 --- a/docs/specs/qcow2.txt +++ b/docs/specs/qcow2.txt @@ -80,7 +80,12 @@ in the description of a field. tables to repair refcounts before accessing the image. - Bits 1-63: Reserved (set to 0) + Bit 1: Corrupt bit. If this bit is set then any data + structure may be corrupt and the image must not + be written to (unless for regaining + consistency). + + Bits 2-63: Reserved (set to 0) 80 - 87: compatible_features Bitmask of compatible features. An implementation can @@ -350,3 +355,6 @@ Snapshot table entry: variable: Unique ID string for the snapshot (not null terminated) variable: Name of the snapshot (not null terminated) + + variable: Padding to round up the snapshot table entry size to the + next multiple of 8. diff --git a/docs/specs/standard-vga.txt b/docs/specs/standard-vga.txt index 8a4c1e93cd..f82773e677 100644 --- a/docs/specs/standard-vga.txt +++ b/docs/specs/standard-vga.txt @@ -5,9 +5,10 @@ QEMU Standard VGA Exists in two variants, for isa and pci. command line switches: - -vga std [ picks isa for -M isapc, otherwise pci ] - -device VGA [ pci variant ] - -device isa-vga [ isa variant ] + -vga std [ picks isa for -M isapc, otherwise pci ] + -device VGA [ pci variant ] + -device isa-vga [ isa variant ] + -device secondary-vga [ legacy-free pci variant ] PCI spec @@ -31,9 +32,15 @@ PCI ROM Region: Holds the vgabios (qemu 0.14+). +The legacy-free variant has no ROM and has PCI_CLASS_DISPLAY_OTHER +instead of PCI_CLASS_DISPLAY_VGA. + + IO ports used ------------- +Doesn't apply to the legacy-free pci variant, use the MMIO bar instead. + 03c0 - 03df : standard vga ports 01ce : bochs vbe interface index port 01cf : bochs vbe interface data port (x86 only) diff --git a/docs/specs/vhost-user.txt b/docs/specs/vhost-user.txt new file mode 100644 index 0000000000..e7f43f51da --- /dev/null +++ b/docs/specs/vhost-user.txt @@ -0,0 +1,261 @@ +Vhost-user Protocol +=================== + +This protocol is aiming to complement the ioctl interface used to control the +vhost implementation in the Linux kernel. It implements the control plane needed +to establish virtqueue sharing with a user space process on the same host. It +uses communication over a Unix domain socket to share file descriptors in the +ancillary data of the message. + +The protocol defines 2 sides of the communication, master and slave. Master is +the application that shares it's virtqueues, in our case QEMU. Slave is the +consumer of the virtqueues. + +In the current implementation QEMU is the Master, and the Slave is intended to +be a software ethernet switch running in user space, such as Snabbswitch. + +Master and slave can be either a client (i.e. connecting) or server (listening) +in the socket communication. + +Message Specification +--------------------- + +Note that all numbers are in the machine native byte order. A vhost-user message +consists of 3 header fields and a payload: + +------------------------------------ +| request | flags | size | payload | +------------------------------------ + + * Request: 32-bit type of the request + * Flags: 32-bit bit field: + - Lower 2 bits are the version (currently 0x01) + - Bit 2 is the reply flag - needs to be sent on each reply from the slave + * Size - 32-bit size of the payload + + +Depending on the request type, payload can be: + + * A single 64-bit integer + ------- + | u64 | + ------- + + u64: a 64-bit unsigned integer + + * A vring state description + --------------- + | index | num | + --------------- + + Index: a 32-bit index + Num: a 32-bit number + + * A vring address description + -------------------------------------------------------------- + | index | flags | size | descriptor | used | available | log | + -------------------------------------------------------------- + + Index: a 32-bit vring index + Flags: a 32-bit vring flags + Descriptor: a 64-bit user address of the vring descriptor table + Used: a 64-bit user address of the vring used ring + Available: a 64-bit user address of the vring available ring + Log: a 64-bit guest address for logging + + * Memory regions description + --------------------------------------------------- + | num regions | padding | region0 | ... | region7 | + --------------------------------------------------- + + Num regions: a 32-bit number of regions + Padding: 32-bit + + A region is: + --------------------------------------- + | guest address | size | user address | + --------------------------------------- + + Guest address: a 64-bit guest address of the region + Size: a 64-bit size + User address: a 64-bit user address + + +In QEMU the vhost-user message is implemented with the following struct: + +typedef struct VhostUserMsg { + VhostUserRequest request; + uint32_t flags; + uint32_t size; + union { + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + }; +} QEMU_PACKED VhostUserMsg; + +Communication +------------- + +The protocol for vhost-user is based on the existing implementation of vhost +for the Linux Kernel. Most messages that can be send via the Unix domain socket +implementing vhost-user have an equivalent ioctl to the kernel implementation. + +The communication consists of master sending message requests and slave sending +message replies. Most of the requests don't require replies. Here is a list of +the ones that do: + + * VHOST_GET_FEATURES + * VHOST_GET_VRING_BASE + +There are several messages that the master sends with file descriptors passed +in the ancillary data: + + * VHOST_SET_MEM_TABLE + * VHOST_SET_LOG_FD + * VHOST_SET_VRING_KICK + * VHOST_SET_VRING_CALL + * VHOST_SET_VRING_ERR + +If Master is unable to send the full message or receives a wrong reply it will +close the connection. An optional reconnection mechanism can be implemented. + +Message types +------------- + + * VHOST_USER_GET_FEATURES + + Id: 2 + Equivalent ioctl: VHOST_GET_FEATURES + Master payload: N/A + Slave payload: u64 + + Get from the underlying vhost implementation the features bitmask. + + * VHOST_USER_SET_FEATURES + + Id: 3 + Ioctl: VHOST_SET_FEATURES + Master payload: u64 + + Enable features in the underlying vhost implementation using a bitmask. + + * VHOST_USER_SET_OWNER + + Id: 4 + Equivalent ioctl: VHOST_SET_OWNER + Master payload: N/A + + Issued when a new connection is established. It sets the current Master + as an owner of the session. This can be used on the Slave as a + "session start" flag. + + * VHOST_USER_RESET_OWNER + + Id: 5 + Equivalent ioctl: VHOST_RESET_OWNER + Master payload: N/A + + Issued when a new connection is about to be closed. The Master will no + longer own this connection (and will usually close it). + + * VHOST_USER_SET_MEM_TABLE + + Id: 6 + Equivalent ioctl: VHOST_SET_MEM_TABLE + Master payload: memory regions description + + Sets the memory map regions on the slave so it can translate the vring + addresses. In the ancillary data there is an array of file descriptors + for each memory mapped region. The size and ordering of the fds matches + the number and ordering of memory regions. + + * VHOST_USER_SET_LOG_BASE + + Id: 7 + Equivalent ioctl: VHOST_SET_LOG_BASE + Master payload: u64 + + Sets the logging base address. + + * VHOST_USER_SET_LOG_FD + + Id: 8 + Equivalent ioctl: VHOST_SET_LOG_FD + Master payload: N/A + + Sets the logging file descriptor, which is passed as ancillary data. + + * VHOST_USER_SET_VRING_NUM + + Id: 9 + Equivalent ioctl: VHOST_SET_VRING_NUM + Master payload: vring state description + + Sets the number of vrings for this owner. + + * VHOST_USER_SET_VRING_ADDR + + Id: 10 + Equivalent ioctl: VHOST_SET_VRING_ADDR + Master payload: vring address description + Slave payload: N/A + + Sets the addresses of the different aspects of the vring. + + * VHOST_USER_SET_VRING_BASE + + Id: 11 + Equivalent ioctl: VHOST_SET_VRING_BASE + Master payload: vring state description + + Sets the base offset in the available vring. + + * VHOST_USER_GET_VRING_BASE + + Id: 12 + Equivalent ioctl: VHOST_USER_GET_VRING_BASE + Master payload: vring state description + Slave payload: vring state description + + Get the available vring base offset. + + * VHOST_USER_SET_VRING_KICK + + Id: 13 + Equivalent ioctl: VHOST_SET_VRING_KICK + Master payload: u64 + + Set the event file descriptor for adding buffers to the vring. It + is passed in the ancillary data. + Bits (0-7) of the payload contain the vring index. Bit 8 is the + invalid FD flag. This flag is set when there is no file descriptor + in the ancillary data. This signals that polling should be used + instead of waiting for a kick. + + * VHOST_USER_SET_VRING_CALL + + Id: 14 + Equivalent ioctl: VHOST_SET_VRING_CALL + Master payload: u64 + + Set the event file descriptor to signal when buffers are used. It + is passed in the ancillary data. + Bits (0-7) of the payload contain the vring index. Bit 8 is the + invalid FD flag. This flag is set when there is no file descriptor + in the ancillary data. This signals that polling will be used + instead of waiting for the call. + + * VHOST_USER_SET_VRING_ERR + + Id: 15 + Equivalent ioctl: VHOST_SET_VRING_ERR + Master payload: u64 + + Set the event file descriptor to signal when error occurs. It + is passed in the ancillary data. + Bits (0-7) of the payload contain the vring index. Bit 8 is the + invalid FD flag. This flag is set when there is no file descriptor + in the ancillary data. + diff --git a/docs/tracing.txt b/docs/tracing.txt index bfc261bcaf..bf2e15ce30 100644 --- a/docs/tracing.txt +++ b/docs/tracing.txt @@ -214,6 +214,42 @@ The "ust" backend uses the LTTng Userspace Tracer library. There are no monitor commands built into QEMU, instead UST utilities should be used to list, enable/disable, and dump traces. +Package lttng-tools is required for userspace tracing. You must ensure that the +current user belongs to the "tracing" group, or manually launch the +lttng-sessiond daemon for the current user prior to running any instance of +QEMU. + +While running an instrumented QEMU, LTTng should be able to list all available +events: + + lttng list -u + +Create tracing session: + + lttng create mysession + +Enable events: + + lttng enable-event qemu:g_malloc -u + +Where the events can either be a comma-separated list of events, or "-a" to +enable all tracepoint events. Start and stop tracing as needed: + + lttng start + lttng stop + +View the trace: + + lttng view + +Destroy tracing session: + + lttng destroy + +Babeltrace can be used at any later time to view the trace: + + babeltrace $HOME/lttng-traces/mysession--