[PATCH] Open metrics disk with O_DIRECT.
In the guest, we really need to open the metrics disk with the O_DIRECT option. Otherwise the guest kernel will cache arbitrary parts of the disk, and won't notice when the host updates it. Now the problem with O_DIRECT is it introduces undetermined restrictions on how you can read from the underlying device. In particular, all read(2) calls must be made with: (a) Buffer must be aligned to a block size. (b) Current file offset must be aligned to a block size. (c) Requested read size must be a multiple of block size. That's quite difficult, but there's a further restriction: There's no reliable way to know what the "block size" is! This patch changes the reads to use O_DIRECT and satisfies the alignment restrictions above, except that we choose an arbitrary block size (64K) which is hopefully large enough to cover any real cases. (Note that this also implies that the minimum size of the metrics disk will be 64K -- currently the default is 256K so we're OK unless a sysadmin reduces the default). Rich. -- Richard Jones, Emerging Technologies, Red Hat http://et.redhat.com/~rjones virt-df lists disk usage of guests without needing to install any software inside the virtual machine. Supports Linux and Windows. http://et.redhat.com/~rjones/virt-df/
Richard W.M. Jones wrote:
In the guest, we really need to open the metrics disk with the O_DIRECT option. Otherwise the guest kernel will cache arbitrary parts of the disk, and won't notice when the host updates it.
Now the problem with O_DIRECT is it introduces undetermined restrictions on how you can read from the underlying device. In particular, all read(2) calls must be made with:
(a) Buffer must be aligned to a block size. (b) Current file offset must be aligned to a block size. (c) Requested read size must be a multiple of block size.
That's quite difficult, but there's a further restriction: There's no reliable way to know what the "block size" is!
This patch changes the reads to use O_DIRECT and satisfies the alignment restrictions above, except that we choose an arbitrary block size (64K) which is hopefully large enough to cover any real cases.
(Note that this also implies that the minimum size of the metrics disk will be 64K -- currently the default is 256K so we're OK unless a sysadmin reduces the default).
From f22c9b50d12f08bb11f1f549ddd2d01e1be6118b Mon Sep 17 00:00:00 2001 From: Richard Jones <rjones@trick.home.annexia.org> Date: Wed, 7 Oct 2009 11:03:23 +0100 Subject: [PATCH 3/4] Open metrics disk with O_DIRECT.
Opening with O_DIRECT avoids having the guest kernel cache old copies of the metrics data. --- libmetrics/libmetrics.c | 101 +++++++++++++++++++++++++++++++++------------- 1 files changed, 72 insertions(+), 29 deletions(-)
diff --git a/libmetrics/libmetrics.c b/libmetrics/libmetrics.c index 3909687..6764c3f 100644 --- a/libmetrics/libmetrics.c +++ b/libmetrics/libmetrics.c @@ -24,6 +24,7 @@
#include <stdio.h> #include <string.h> +#include <fcntl.h> #include <unistd.h> #include <errno.h> #include <dirent.h> @@ -278,6 +279,45 @@ out: return ret; }
+/* Read from an O_DIRECT device. You can't do arbitrary reads on + * such devices. You can only read block-aligned block-sized + * chunks of data into block-aligned memory. + * + * This returns 'size' bytes in 'buf', read from 'offset' in the + * file 'fd'. + */ + +/* There's no way to determine this, so just choose a large + * block size. + */ +#define BLOCK_SIZE 65536 + +static int +odirect_read (int fd, void *buf, size_t offset, size_t size)
vhostmd doesn't have a HACKERS or coding style doc, but the current style does not include a space between function name and parameter list. A small nit but ACK. Thanks Rich!
+{ + if (lseek (fd, 0, SEEK_SET) == -1) + return -1; + + size_t n = ((offset + size) + BLOCK_SIZE - 1) & ~(BLOCK_SIZE - 1); + void *mem; + int r; + r = posix_memalign (&mem, BLOCK_SIZE, n); + if (r != 0) { + errno = r; + return -1; + } + + if (read (fd, mem, n) != n) { + free (mem); + return -1; + } + + memcpy (buf, (char *) mem + offset, size); + + free (mem); + return 0; +} + /* * Read metrics disk and populate mdisk * Location of metrics disk is derived by looking at all block @@ -289,7 +329,7 @@ static int read_mdisk(metric_disk *mdisk) mdisk_header md_header; uint32_t busy; uint32_t sig; - FILE *fp; + int fd; char *path;
DIR* dir; @@ -310,51 +350,54 @@ retry: #else path = strdup("/dev/shm/vhostmd0"); #endif - fp = fopen(path, "r"); - if (fp == NULL) { - free(path); + /* Open with O_DIRECT to avoid kernel keeping old copies around + * in the cache. + */ + fd = open (path, O_RDONLY|O_DIRECT); + if (fd == -1) { + free (path); continue; } - if (fread(&md_header, 1, sizeof(md_header), fp) != sizeof(md_header)) { - free(path); - fclose(fp); + if (odirect_read (fd, &md_header, 0, sizeof md_header) == -1) { + free (path); + close (fd); continue; }
if ((sig = ntohl(md_header.sig)) == MDISK_SIGNATURE) { busy = ntohl(md_header.busy); if (busy) { - fclose(fp); - free(path); - sleep(1); - goto retry; + close(fd); + free(path); + sleep(1); + goto retry; } mdisk->sum = ntohl(md_header.sum); mdisk->length = ntohl(md_header.length); mdisk->buffer = malloc(mdisk->length); mdisk->disk_name = strdup(path); - fread(mdisk->buffer, 1, mdisk->length, fp); - free(path); + /* XXX check return value */ + odirect_read (fd, mdisk->buffer, sizeof md_header, mdisk->length); + free(path);
/* Verify data still valid */ - fseek(fp, 0, SEEK_SET); - if (fread(&md_header, 1, sizeof(md_header), fp) != sizeof(md_header)) { - mdisk_content_free(); - fclose(fp); + if (odirect_read (fd, &md_header, 0, sizeof md_header) == -1) { + mdisk_content_free(); + close (fd); sleep(1); goto retry; } busy = ntohl(md_header.busy); if (busy || mdisk->sum != ntohl(md_header.sum)) { mdisk_content_free(); - fclose(fp); + close (fd); sleep(1); goto retry; } - fclose(fp); - break; + close (fd); + break; } - fclose(fp); + close (fd); }
if (mdisk->buffer == NULL) @@ -391,20 +434,20 @@ static uint32_t read_mdisk_sum(metric_disk *mdisk) { mdisk_header md_header; uint32_t sum = 0; - FILE *fp; + int fd;
if (mdisk == NULL || mdisk->disk_name == NULL) return 0;
- fp = fopen(mdisk->disk_name, "r"); - if (fp == NULL) - return 0; + fd = open(mdisk->disk_name, O_RDONLY|O_DIRECT); + if (fd == -1) + return 0;
- if (fread(&md_header, 1, sizeof(md_header), fp) != sizeof(md_header)) { - fclose(fp); - return 0; + if (odirect_read (fd, &md_header, 0, sizeof md_header) == -1) { + close(fd); + return 0; } - fclose(fp); + close (fd);
if (ntohl(md_header.sig) == MDISK_SIGNATURE) { if (ntohl(md_header.busy)) { -- 1.6.2.5 -- To unsubscribe, e-mail: vhostmd+unsubscribe@opensuse.org For additional commands, e-mail: vhostmd+help@opensuse.org
participants (2)
-
Jim Fehlig
-
Richard W.M. Jones