Discussion:
[libvirt] [PATCH v2] qemu: Introduce caching whether /dev/kvm is accessible
Marc Hartmayer
2018-10-29 17:34:58 UTC
Permalink
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store whether
/dev/kvm is accessible or not and only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.

Suggested-by: Daniel P. Berrangé <***@redhat.com>
Signed-off-by: Marc Hartmayer <***@linux.ibm.com>
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)

diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index e228f52ec0bb..85516954149b 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -3238,6 +3238,10 @@ struct _virQEMUCapsCachePriv {
virArch hostArch;
unsigned int microcodeVersion;
char *kernelVersion;
+
+ /* cache whether /dev/kvm is usable as runUid:runGid */
+ virTristateBool kvmUsable;
+ time_t kvmCtime;
};
typedef struct _virQEMUCapsCachePriv virQEMUCapsCachePriv;
typedef virQEMUCapsCachePriv *virQEMUCapsCachePrivPtr;
@@ -3824,6 +3828,52 @@ virQEMUCapsSaveFile(void *data,
}


+/* Determine whether '/dev/kvm' is usable as QEMU user:QEMU group. */
+static bool
+virQEMUCapsKVMUsable(virQEMUCapsCachePrivPtr priv)
+{
+ struct stat sb;
+ static const char *kvm_device = "/dev/kvm";
+ virTristateBool value;
+ virTristateBool cached_value = priv->kvmUsable;
+ time_t kvm_ctime;
+ time_t cached_kvm_ctime = priv->kvmCtime;
+
+ if (stat(kvm_device, &sb) < 0) {
+ virReportSystemError(errno,
+ _("Failed to stat %s"), kvm_device);
+ return false;
+ }
+ kvm_ctime = sb.st_ctime;
+
+ if (kvm_ctime != cached_kvm_ctime) {
+ VIR_DEBUG("%s has changed (%lld vs %lld)", kvm_device,
+ (long long)kvm_ctime, (long long)cached_kvm_ctime);
+ cached_value = VIR_TRISTATE_BOOL_ABSENT;
+ }
+
+ if (cached_value != VIR_TRISTATE_BOOL_ABSENT)
+ return cached_value == VIR_TRISTATE_BOOL_YES;
+
+ if (virFileAccessibleAs(kvm_device, R_OK | W_OK,
+ priv->runUid, priv->runGid) == 0) {
+ value = VIR_TRISTATE_BOOL_YES;
+ } else {
+ value = VIR_TRISTATE_BOOL_NO;
+ }
+
+ /* There is a race window between 'stat' and
+ * 'virFileAccessibleAs'. However, since we're only interested in
+ * detecting changes *after* the virFileAccessibleAs check, we can
+ * neglect this here.
+ */
+ priv->kvmCtime = kvm_ctime;
+ priv->kvmUsable = value;
+
+ return value == VIR_TRISTATE_BOOL_YES;
+}
+
+
static bool
virQEMUCapsIsValid(void *data,
void *privData)
@@ -3872,8 +3922,7 @@ virQEMUCapsIsValid(void *data,
return true;
}

- kvmUsable = virFileAccessibleAs("/dev/kvm", R_OK | W_OK,
- priv->runUid, priv->runGid) == 0;
+ kvmUsable = virQEMUCapsKVMUsable(priv);

if (!virQEMUCapsGet(qemuCaps, QEMU_CAPS_KVM) &&
kvmUsable) {
@@ -4684,6 +4733,7 @@ virQEMUCapsCacheNew(const char *libDir,
priv->runUid = runUid;
priv->runGid = runGid;
priv->microcodeVersion = microcodeVersion;
+ priv->kvmUsable = VIR_TRISTATE_BOOL_ABSENT;

if (uname(&uts) == 0 &&
virAsprintf(&priv->kernelVersion, "%s %s", uts.release, uts.version) < 0)
--
2.17.0
Michal Privoznik
2018-10-30 08:13:50 UTC
Permalink
Post by Marc Hartmayer
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store the result
whether /dev/kvm is accessible or not and we only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index e228f52ec0bb..85516954149b 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -3238,6 +3238,10 @@ struct _virQEMUCapsCachePriv {
virArch hostArch;
unsigned int microcodeVersion;
char *kernelVersion;
+
+ /* cache whether /dev/kvm is usable as runUid:runGuid */
+ virTristateBool kvmUsable;
+ time_t kvmCtime;
};
typedef struct _virQEMUCapsCachePriv virQEMUCapsCachePriv;
typedef virQEMUCapsCachePriv *virQEMUCapsCachePrivPtr;
@@ -3824,6 +3828,52 @@ virQEMUCapsSaveFile(void *data,
}
+/* Determine whether '/dev/kvm' is usable as QEMU user:QEMU group. */
+static bool
+virQEMUCapsKVMUsable(virQEMUCapsCachePrivPtr priv)
+{
+ struct stat sb;
+ static const char *kvm_device = "/dev/kvm";
+ virTristateBool value;
+ virTristateBool cached_value = priv->kvmUsable;
+ time_t kvm_ctime;
+ time_t cached_kvm_ctime = priv->kvmCtime;
+
+ if (stat(kvm_device, &sb) < 0) {
+ virReportSystemError(errno,
+ _("Failed to stat %s"), kvm_device);
+ return false;
+ }
+ kvm_ctime = sb.st_ctime;
This doesn't feel right. The ctime of /dev/kvm changes every time qemu is
started or powered off (try running stat on it before and after a
domain is started/shut off). So effectively we will fork more often than
we would think. Should we cache the inode number instead? Because all
we care about is simply whether the file is there.
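
Something along these lines, say (only a sketch; kvmInode would be a
hypothetical new field replacing kvmCtime):

/* Sketch of the inode-based variant: only invalidate the cached
 * result when the device node itself is recreated (its inode
 * changes), not on every timestamp update. */
if (stat(kvm_device, &sb) < 0) {
    virReportSystemError(errno,
                         _("Failed to stat %s"), kvm_device);
    return false;
}

if (sb.st_ino != priv->kvmInode) {
    VIR_DEBUG("%s was recreated, discarding cached result", kvm_device);
    priv->kvmUsable = VIR_TRISTATE_BOOL_ABSENT;
    priv->kvmInode = sb.st_ino;
}
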
Post by Marc Hartmayer
+
+ if (kvm_ctime != cached_kvm_ctime) {
+ VIR_DEBUG("%s has changed (%lld vs %lld)", kvm_device,
+ (long long)kvm_ctime, (long long)cached_kvm_ctime);
+ cached_value = VIR_TRISTATE_BOOL_ABSENT;
+ }
+
+ if (cached_value != VIR_TRISTATE_BOOL_ABSENT)
+ return cached_value == VIR_TRISTATE_BOOL_YES;
+
+ if (virFileAccessibleAs(kvm_device, R_OK | W_OK,
+ priv->runUid, priv->runGid) == 0) {
+ value = VIR_TRISTATE_BOOL_YES;
+ } else {
+ value = VIR_TRISTATE_BOOL_NO;
+ }
+
+ /* There is a race window between 'stat' and
+ * 'virFileAccessibleAs'. However, since we're only interested in
+ * detecting changes *after* the virFileAccessibleAs check, we can
+ * neglect this here.
+ */
+ priv->kvmCtime = kvm_ctime;
+ priv->kvmUsable = value;
+
+ return value == VIR_TRISTATE_BOOL_YES;
+}
+
+
static bool
virQEMUCapsIsValid(void *data,
void *privData)
@@ -3872,8 +3922,7 @@ virQEMUCapsIsValid(void *data,
return true;
}
- kvmUsable = virFileAccessibleAs("/dev/kvm", R_OK | W_OK,
- priv->runUid, priv->runGid) == 0;
+ kvmUsable = virQEMUCapsKVMUsable(priv);
if (!virQEMUCapsGet(qemuCaps, QEMU_CAPS_KVM) &&
kvmUsable) {
@@ -4684,6 +4733,7 @@ virQEMUCapsCacheNew(const char *libDir,
priv->runUid = runUid;
priv->runGid = runGid;
priv->microcodeVersion = microcodeVersion;
+ priv->kvmUsable = VIR_TRISTATE_BOOL_ABSENT;
if (uname(&uts) == 0 &&
virAsprintf(&priv->kernelVersion, "%s %s", uts.release, uts.version) < 0)
Otherwise looking good.

Michal
Daniel P. Berrangé
2018-10-30 09:35:33 UTC
Permalink
Post by Michal Privoznik
Post by Marc Hartmayer
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store the result
whether /dev/kvm is accessible or not and we only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index e228f52ec0bb..85516954149b 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -3238,6 +3238,10 @@ struct _virQEMUCapsCachePriv {
virArch hostArch;
unsigned int microcodeVersion;
char *kernelVersion;
+
+ /* cache whether /dev/kvm is usable as runUid:runGuid */
+ virTristateBool kvmUsable;
+ time_t kvmCtime;
};
typedef struct _virQEMUCapsCachePriv virQEMUCapsCachePriv;
typedef virQEMUCapsCachePriv *virQEMUCapsCachePrivPtr;
@@ -3824,6 +3828,52 @@ virQEMUCapsSaveFile(void *data,
}
+/* Determine whether '/dev/kvm' is usable as QEMU user:QEMU group. */
+static bool
+virQEMUCapsKVMUsable(virQEMUCapsCachePrivPtr priv)
+{
+ struct stat sb;
+ static const char *kvm_device = "/dev/kvm";
+ virTristateBool value;
+ virTristateBool cached_value = priv->kvmUsable;
+ time_t kvm_ctime;
+ time_t cached_kvm_ctime = priv->kvmCtime;
+
+ if (stat(kvm_device, &sb) < 0) {
+ virReportSystemError(errno,
+ _("Failed to stat %s"), kvm_device);
+ return false;
+ }
+ kvm_ctime = sb.st_ctime;
This doesn't feel right. /dev/kvm ctime is changed every time qemu is
started or powered off (try running stat over it before and after a
domain is started/shut off). So effectively we will fork more often than
we would think. Should we cache inode number instead? Because for all
that we care is simply if the file is there.
Urgh, that is a bit strange and not the usual semantics for timestamps :-(

We can't stat the inode - the code was explicitly trying to cope with the
way /dev/kvm can change permissions when udev rules get applied. We would
have to compare the user, group, permissions mask and even ACL, or a hash
of those.
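
Roughly a fragment like the following, say (the kvmUid/kvmGid/kvmMode
fields are only illustrative, and the ACL comparison is still missing):

/* Illustrative fragment: invalidate the cached answer when ownership
 * or the permission bits of /dev/kvm change, instead of keying off
 * ctime. A complete version would also have to hash and compare the
 * ACL on the device node. */
if (sb.st_uid != priv->kvmUid ||
    sb.st_gid != priv->kvmGid ||
    sb.st_mode != priv->kvmMode) {
    VIR_DEBUG("/dev/kvm ownership or mode changed, rechecking");
    priv->kvmUsable = VIR_TRISTATE_BOOL_ABSENT;
    priv->kvmUid = sb.st_uid;
    priv->kvmGid = sb.st_gid;
    priv->kvmMode = sb.st_mode;
}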

Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Michal Privoznik
2018-10-30 10:08:45 UTC
Permalink
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Marc Hartmayer
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store the result
whether /dev/kvm is accessible or not and we only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index e228f52ec0bb..85516954149b 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -3238,6 +3238,10 @@ struct _virQEMUCapsCachePriv {
virArch hostArch;
unsigned int microcodeVersion;
char *kernelVersion;
+
+ /* cache whether /dev/kvm is usable as runUid:runGuid */
+ virTristateBool kvmUsable;
+ time_t kvmCtime;
};
typedef struct _virQEMUCapsCachePriv virQEMUCapsCachePriv;
typedef virQEMUCapsCachePriv *virQEMUCapsCachePrivPtr;
@@ -3824,6 +3828,52 @@ virQEMUCapsSaveFile(void *data,
}
+/* Determine whether '/dev/kvm' is usable as QEMU user:QEMU group. */
+static bool
+virQEMUCapsKVMUsable(virQEMUCapsCachePrivPtr priv)
+{
+ struct stat sb;
+ static const char *kvm_device = "/dev/kvm";
+ virTristateBool value;
+ virTristateBool cached_value = priv->kvmUsable;
+ time_t kvm_ctime;
+ time_t cached_kvm_ctime = priv->kvmCtime;
+
+ if (stat(kvm_device, &sb) < 0) {
+ virReportSystemError(errno,
+ _("Failed to stat %s"), kvm_device);
+ return false;
+ }
+ kvm_ctime = sb.st_ctime;
This doesn't feel right. /dev/kvm ctime is changed every time qemu is
started or powered off (try running stat over it before and after a
domain is started/shut off). So effectively we will fork more often than
we would think. Should we cache inode number instead? Because for all
that we care is simply if the file is there.
Urgh, that is a bit strange and not the usual semantics for timestamps :-(
Indeed.
Post by Daniel P. Berrangé
We can't stat the inode - the code was explicitly trying to cope with the
way /dev/kvm can change permissions when udev rules get applied. We would
have to compare the user, group, permissions mask and even ACL, or a hash
of those.
Well, we can use ctime as suggested and post a patch for the kernel to
fix the ctime behaviour. Until the patch is merged our behaviour would
be suboptimal, but still better than it is now.

Michal
Daniel P. Berrangé
2018-10-30 10:32:08 UTC
Permalink
Post by Michal Privoznik
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Marc Hartmayer
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store the result
whether /dev/kvm is accessible or not and we only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index e228f52ec0bb..85516954149b 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -3238,6 +3238,10 @@ struct _virQEMUCapsCachePriv {
virArch hostArch;
unsigned int microcodeVersion;
char *kernelVersion;
+
+ /* cache whether /dev/kvm is usable as runUid:runGuid */
+ virTristateBool kvmUsable;
+ time_t kvmCtime;
};
typedef struct _virQEMUCapsCachePriv virQEMUCapsCachePriv;
typedef virQEMUCapsCachePriv *virQEMUCapsCachePrivPtr;
@@ -3824,6 +3828,52 @@ virQEMUCapsSaveFile(void *data,
}
+/* Determine whether '/dev/kvm' is usable as QEMU user:QEMU group. */
+static bool
+virQEMUCapsKVMUsable(virQEMUCapsCachePrivPtr priv)
+{
+ struct stat sb;
+ static const char *kvm_device = "/dev/kvm";
+ virTristateBool value;
+ virTristateBool cached_value = priv->kvmUsable;
+ time_t kvm_ctime;
+ time_t cached_kvm_ctime = priv->kvmCtime;
+
+ if (stat(kvm_device, &sb) < 0) {
+ virReportSystemError(errno,
+ _("Failed to stat %s"), kvm_device);
+ return false;
+ }
+ kvm_ctime = sb.st_ctime;
This doesn't feel right. /dev/kvm ctime is changed every time qemu is
started or powered off (try running stat over it before and after a
domain is started/shut off). So effectively we will fork more often than
we would think. Should we cache inode number instead? Because for all
that we care is simply if the file is there.
Urgh, that is a bit strange and not the usual semantics for timestamps :-(
Indeed.
Post by Daniel P. Berrangé
We can't stat the inode - the code was explicitly trying to cope with the
way /dev/kvm can change permissions when udev rules get applied. We would
have to compare the user, group, permissions mask and even ACL, or a hash
of those.
Well, we can use ctime as suggested and post a patch for kernel to fix
ctime behaviour. Until the patch is merged our behaviour would be
suboptimal, but still better than it is now.
I guess let's talk to the KVM team for their input on this and then
decide what to do.


Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Daniel P. Berrangé
2018-10-30 12:55:50 UTC
Permalink
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Marc Hartmayer
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store the result
whether /dev/kvm is accessible or not and we only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index e228f52ec0bb..85516954149b 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -3238,6 +3238,10 @@ struct _virQEMUCapsCachePriv {
virArch hostArch;
unsigned int microcodeVersion;
char *kernelVersion;
+
+ /* cache whether /dev/kvm is usable as runUid:runGuid */
+ virTristateBool kvmUsable;
+ time_t kvmCtime;
};
typedef struct _virQEMUCapsCachePriv virQEMUCapsCachePriv;
typedef virQEMUCapsCachePriv *virQEMUCapsCachePrivPtr;
@@ -3824,6 +3828,52 @@ virQEMUCapsSaveFile(void *data,
}
+/* Determine whether '/dev/kvm' is usable as QEMU user:QEMU group. */
+static bool
+virQEMUCapsKVMUsable(virQEMUCapsCachePrivPtr priv)
+{
+ struct stat sb;
+ static const char *kvm_device = "/dev/kvm";
+ virTristateBool value;
+ virTristateBool cached_value = priv->kvmUsable;
+ time_t kvm_ctime;
+ time_t cached_kvm_ctime = priv->kvmCtime;
+
+ if (stat(kvm_device, &sb) < 0) {
+ virReportSystemError(errno,
+ _("Failed to stat %s"), kvm_device);
+ return false;
+ }
+ kvm_ctime = sb.st_ctime;
This doesn't feel right. /dev/kvm ctime is changed every time qemu is
started or powered off (try running stat over it before and after a
domain is started/shut off). So effectively we will fork more often than
we would think. Should we cache inode number instead? Because for all
that we care is simply if the file is there.
Urgh, that is a bit strange and not the usual semantics for timestamps :-(
Indeed.
Post by Daniel P. Berrangé
We can't stat the inode - the code was explicitly trying to cope with the
way /dev/kvm can change permissions when udev rules get applied. We would
have to compare the user, group, permissions mask and even ACL, or a hash
of those.
Well, we can use ctime as suggested and post a patch for kernel to fix
ctime behaviour. Until the patch is merged our behaviour would be
suboptimal, but still better than it is now.
I guess lets talk to KVM team for their input on this and then decide
what todo.
Hmm, I wonder if it is not actually a kernel problem, but rather something
in userspace genuinely touching the device in a way that causes these
timestamps to be updated.

E.g. I vaguely recall a udev rule that resets permissions on device nodes
whenever an FD is closed, which might cause this kind of behaviour.

Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Michal Privoznik
2018-10-30 13:46:02 UTC
Permalink
Post by Daniel P. Berrangé
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Marc Hartmayer
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store the result
whether /dev/kvm is accessible or not and we only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index e228f52ec0bb..85516954149b 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -3238,6 +3238,10 @@ struct _virQEMUCapsCachePriv {
virArch hostArch;
unsigned int microcodeVersion;
char *kernelVersion;
+
+ /* cache whether /dev/kvm is usable as runUid:runGuid */
+ virTristateBool kvmUsable;
+ time_t kvmCtime;
};
typedef struct _virQEMUCapsCachePriv virQEMUCapsCachePriv;
typedef virQEMUCapsCachePriv *virQEMUCapsCachePrivPtr;
@@ -3824,6 +3828,52 @@ virQEMUCapsSaveFile(void *data,
}
+/* Determine whether '/dev/kvm' is usable as QEMU user:QEMU group. */
+static bool
+virQEMUCapsKVMUsable(virQEMUCapsCachePrivPtr priv)
+{
+ struct stat sb;
+ static const char *kvm_device = "/dev/kvm";
+ virTristateBool value;
+ virTristateBool cached_value = priv->kvmUsable;
+ time_t kvm_ctime;
+ time_t cached_kvm_ctime = priv->kvmCtime;
+
+ if (stat(kvm_device, &sb) < 0) {
+ virReportSystemError(errno,
+ _("Failed to stat %s"), kvm_device);
+ return false;
+ }
+ kvm_ctime = sb.st_ctime;
This doesn't feel right. /dev/kvm ctime is changed every time qemu is
started or powered off (try running stat over it before and after a
domain is started/shut off). So effectively we will fork more often than
we would think. Should we cache inode number instead? Because for all
that we care is simply if the file is there.
Urgh, that is a bit strange and not the usual semantics for timestamps :-(
Indeed.
Post by Daniel P. Berrangé
We can't stat the inode - the code was explicitly trying to cope with the
way /dev/kvm can change permissions when udev rules get applied. We would
have to compare the user, group, permissions mask and even ACL, or a hash
of those.
Well, we can use ctime as suggested and post a patch for kernel to fix
ctime behaviour. Until the patch is merged our behaviour would be
suboptimal, but still better than it is now.
I guess lets talk to KVM team for their input on this and then decide
what todo.
Hmm, I wonder if it is not actually a kernel problem, but rather something
in userspace genuinely touching the device in a way that caues these
timestamps to be updated.
It is a kernel problem. In my testing, the moment I call:

ioctl(kvm, KVM_CREATE_VM, 0);

all three times (atime, mtime, ctime) are changed. However, prior to
that I'm calling some 'query' ioctls (KVM_GET_API_VERSION,
KVM_CHECK_EXTENSION) but none of the three times is changed.
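
A minimal standalone reproducer of that test could look roughly like
this (assuming linux/kvm.h is available and /dev/kvm can be opened):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static void show_times(const char *when)
{
    struct stat sb;

    if (stat("/dev/kvm", &sb) == 0)
        printf("%s: atime=%lld mtime=%lld ctime=%lld\n", when,
               (long long)sb.st_atime, (long long)sb.st_mtime,
               (long long)sb.st_ctime);
}

int main(void)
{
    int kvm = open("/dev/kvm", O_RDWR);

    if (kvm < 0) {
        perror("open /dev/kvm");
        return 1;
    }

    show_times("before ioctls");

    /* 'query' ioctls: no timestamp change observed */
    ioctl(kvm, KVM_GET_API_VERSION, 0);
    ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_NR_VCPUS);
    show_times("after query ioctls");

    /* creating a VM: all three times change here; later in the thread
     * this is attributed to udev reacting to the resulting change
     * uevent rather than to the kernel itself */
    ioctl(kvm, KVM_CREATE_VM, 0);
    sleep(1);   /* give udevd a moment to process the uevent */
    show_times("after KVM_CREATE_VM");

    close(kvm);
    return 0;
}
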
Post by Daniel P. Berrangé
eg I vaguely recall a udev rule that resets permissions on device nodes
whenever an FD is closed, which might cause this kind of behaviour
Huh, I did not know that udev can hook on close(2).

Michal
Michal Privoznik
2018-10-30 14:45:36 UTC
Permalink
Post by Michal Privoznik
Post by Daniel P. Berrangé
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Marc Hartmayer
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store the result
whether /dev/kvm is accessible or not and we only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index e228f52ec0bb..85516954149b 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -3238,6 +3238,10 @@ struct _virQEMUCapsCachePriv {
virArch hostArch;
unsigned int microcodeVersion;
char *kernelVersion;
+
+ /* cache whether /dev/kvm is usable as runUid:runGuid */
+ virTristateBool kvmUsable;
+ time_t kvmCtime;
};
typedef struct _virQEMUCapsCachePriv virQEMUCapsCachePriv;
typedef virQEMUCapsCachePriv *virQEMUCapsCachePrivPtr;
@@ -3824,6 +3828,52 @@ virQEMUCapsSaveFile(void *data,
}
+/* Determine whether '/dev/kvm' is usable as QEMU user:QEMU group. */
+static bool
+virQEMUCapsKVMUsable(virQEMUCapsCachePrivPtr priv)
+{
+ struct stat sb;
+ static const char *kvm_device = "/dev/kvm";
+ virTristateBool value;
+ virTristateBool cached_value = priv->kvmUsable;
+ time_t kvm_ctime;
+ time_t cached_kvm_ctime = priv->kvmCtime;
+
+ if (stat(kvm_device, &sb) < 0) {
+ virReportSystemError(errno,
+ _("Failed to stat %s"), kvm_device);
+ return false;
+ }
+ kvm_ctime = sb.st_ctime;
This doesn't feel right. /dev/kvm ctime is changed every time qemu is
started or powered off (try running stat over it before and after a
domain is started/shut off). So effectively we will fork more often than
we would think. Should we cache inode number instead? Because for all
that we care is simply if the file is there.
Urgh, that is a bit strange and not the usual semantics for timestamps :-(
Indeed.
Post by Daniel P. Berrangé
We can't stat the inode - the code was explicitly trying to cope with the
way /dev/kvm can change permissions when udev rules get applied. We would
have to compare the user, group, permissions mask and even ACL, or a hash
of those.
Well, we can use ctime as suggested and post a patch for kernel to fix
ctime behaviour. Until the patch is merged our behaviour would be
suboptimal, but still better than it is now.
I guess lets talk to KVM team for their input on this and then decide
what todo.
Hmm, I wonder if it is not actually a kernel problem, but rather something
in userspace genuinely touching the device in a way that caues these
timestamps to be updated.
ioctl(kvm, KVM_CREATE_VM, 0);
Okay, I have to retract this claim. 'udevadm monitor' shows some events:

KERNEL[3631.129645] change /devices/virtual/misc/kvm (misc)
UDEV [3631.130816] change /devices/virtual/misc/kvm (misc)

and stopping udevd leaves all three times untouched. So it is udev after
all. I just don't know how to find the rule that is causing the issue.
Anyway, as for this patch, I think we can merge it in the end, can't we?

Michal
Daniel P. Berrangé
2018-10-30 15:07:59 UTC
Permalink
Post by Michal Privoznik
Post by Michal Privoznik
Post by Daniel P. Berrangé
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Marc Hartmayer
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store the result
whether /dev/kvm is accessible or not and we only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index e228f52ec0bb..85516954149b 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -3238,6 +3238,10 @@ struct _virQEMUCapsCachePriv {
virArch hostArch;
unsigned int microcodeVersion;
char *kernelVersion;
+
+ /* cache whether /dev/kvm is usable as runUid:runGuid */
+ virTristateBool kvmUsable;
+ time_t kvmCtime;
};
typedef struct _virQEMUCapsCachePriv virQEMUCapsCachePriv;
typedef virQEMUCapsCachePriv *virQEMUCapsCachePrivPtr;
@@ -3824,6 +3828,52 @@ virQEMUCapsSaveFile(void *data,
}
+/* Determine whether '/dev/kvm' is usable as QEMU user:QEMU group. */
+static bool
+virQEMUCapsKVMUsable(virQEMUCapsCachePrivPtr priv)
+{
+ struct stat sb;
+ static const char *kvm_device = "/dev/kvm";
+ virTristateBool value;
+ virTristateBool cached_value = priv->kvmUsable;
+ time_t kvm_ctime;
+ time_t cached_kvm_ctime = priv->kvmCtime;
+
+ if (stat(kvm_device, &sb) < 0) {
+ virReportSystemError(errno,
+ _("Failed to stat %s"), kvm_device);
+ return false;
+ }
+ kvm_ctime = sb.st_ctime;
This doesn't feel right. /dev/kvm ctime is changed every time qemu is
started or powered off (try running stat over it before and after a
domain is started/shut off). So effectively we will fork more often than
we would think. Should we cache inode number instead? Because for all
that we care is simply if the file is there.
Urgh, that is a bit strange and not the usual semantics for timestamps :-(
Indeed.
Post by Daniel P. Berrangé
We can't stat the inode - the code was explicitly trying to cope with the
way /dev/kvm can change permissions when udev rules get applied. We would
have to compare the user, group, permissions mask and even ACL, or a hash
of those.
Well, we can use ctime as suggested and post a patch for kernel to fix
ctime behaviour. Until the patch is merged our behaviour would be
suboptimal, but still better than it is now.
I guess lets talk to KVM team for their input on this and then decide
what todo.
Hmm, I wonder if it is not actually a kernel problem, but rather something
in userspace genuinely touching the device in a way that caues these
timestamps to be updated.
ioctl(kvm, KVM_CREATE_VM, 0);
KERNEL[3631.129645] change /devices/virtual/misc/kvm (misc)
UDEV [3631.130816] change /devices/virtual/misc/kvm (misc)
and stopping udevd leaves all three times untouched. So it is udev after
all. I just don't know how to find the rule that is causing the issue.
Anyway, as for this patch, I think we can merge it in the end, can't we?
Not really. Udev is in use everywhere, so this behaviour makes the
patch useless in practice, even though it is technically right in
theory :-(

Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Michal Privoznik
2018-10-30 15:47:32 UTC
Permalink
Post by Daniel P. Berrangé
Post by Marc Hartmayer
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store the result
whether /dev/kvm is accessible or not and we only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)
Not really. Udev is in use everywhere, so this behaviour makes the
patch useless in practice, even though it is technically right in
theory :-(
Well, caching owner + seclabels + ACLs won't help either. What if a user
loads some profile into AppArmor or something that denies previously
allowed access to /dev/kvm (or vice versa)? What I am saying is that
there are some security models which base their decisions on something
other than file attributes.

Honestly, I don't have a solution.

Michal
Daniel P. Berrangé
2018-10-30 15:53:59 UTC
Permalink
Post by Michal Privoznik
Post by Daniel P. Berrangé
Post by Marc Hartmayer
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store the result
whether /dev/kvm is accessible or not and we only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)
Not really. Udev is in use everywhere, so this behaviour makes the
patch useless in practice, even though it is technically right in
theory :-(
Well, caching owner + seclabels + ACLs won't help either. What if user
loads some profile into AppArmor or something that denies previously
allowed access to /dev/kvm (or vice versa)? What I am saying is that
there are some security models which base their decisions on something
else than file attributes.
The virFileAccessibleAs check for /dev/kvm was put in there to ensure
that we correctly report usability of KVM in the capabilities XML
according to file permissions/ownership. Essentially KVM is not usable
until the udev rule is applied to change permissions to world accessible
or to set the kvm group.

I don't think we need to care about selinux/apparmor restrictions - we just
need to be no worse than what we cope with today, which is just perms
and owner/group.

Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Ján Tomko
2018-11-01 04:11:30 UTC
Permalink
Post by Daniel P. Berrangé
Post by Michal Privoznik
Well, caching owner + seclabels + ACLs won't help either. What if user
loads some profile into AppArmor or something that denies previously
allowed access to /dev/kvm (or vice versa)? What I am saying is that
there are some security models which base their decisions on something
else than file attributes.
The virFileAccessibleAs check for /dev/kvm was put in their to ensure
that we correctly report usability of KVM in the capabilities XML
according to file permissions/ownership. Essentially KVM is not usable
until the udev rule is applied to change permissions to world accessible
or to set the kvm group.
Can libvirt be run sooner than the permissions are set up during regular
start-up, or is this just for the case of the user randomly touching the
permissions?

IIRC the problem was that users with vmx disabled in BIOS setup needed
to delete libvirt's qemuCaps cache manually after enabling it even after
restarting the system.
To catch that, all we'd have to do is run the check once per daemon
lifetime,
Post by Daniel P. Berrangé
I don't think we need to care aout selinux/apparmor restrictions - just
need to be no worse than what we cope with today, which is just perms
and owner/group.
but that could be considered worse according to this metric.

Jano
Michal Privoznik
2018-11-01 08:07:05 UTC
Permalink
Post by Ján Tomko
Post by Daniel P. Berrangé
Post by Michal Privoznik
Well, caching owner + seclabels + ACLs won't help either. What if user
loads some profile into AppArmor or something that denies previously
allowed access to /dev/kvm (or vice versa)? What I am saying is that
there are some security models which base their decisions on something
else than file attributes.
The virFileAccessibleAs check for /dev/kvm was put in their to ensure
that we correctly report usability of KVM in the capabilities XML
according to file permissions/ownership. Essentially KVM is not usable
until the udev rule is applied to change permissions to world accessible
or to set the kvm group.
Can libvirt be run sooner than the permissions are set up during regular
start-up, or this is just for the case of the user randomly touching the
permissions?
Sure it can. And it is. For instance, I have ConsoleKit installed on my
system, and when I log in as a regular user it adds an
ACL entry to /dev/kvm to allow access to the user. Libvirtd is already
running at that point (well, except not really because I don't run it as
a service since I'm running it from git all the time. But you get the
picture).
Post by Ján Tomko
IIRC the problem was that users with vmx disabled in BIOS setup needed
to delete libvirt's qemuCaps cache manually after enabling it even after
restarting the system.
Sounds like it. The commit that introduced the check is d87df9bd39b.

Honestly, I don't think we need to care about perms - we can assume
those are set properly (as they don't change often). What we have to
care about is inserting/removing the kvm module (even though it's not
necessary these days since virtualbox has learned how to co-exist with
KVM).

Michal
Daniel P. Berrangé
2018-11-01 09:16:21 UTC
Permalink
Post by Michal Privoznik
Post by Ján Tomko
IIRC the problem was that users with vmx disabled in BIOS setup needed
to delete libvirt's qemuCaps cache manually after enabling it even after
restarting the system.
Sounds like it. The commit that introduced the check is d87df9bd39b.
Honestly, I don't think we need to care about perms - we can assume
those are set properly (as they don't change often). What we have to
We can't assume permissions are correct. One of the core issues we hit
was that the udev rule which sets the permissions has been in the QEMU
RPM historically. So you can have the kernel module loaded automatically
by systemd when it detects VMX, but its permissions will be 0600 until
the qemu-kvm RPM is installed and udev is triggered to fix the permissions
to 0666.
Post by Michal Privoznik
care about is inserting/removing kvm module (even though it's not
necessary these days since virtualbox has learned how to co-exist with
KVM).
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Martin Kletzander
2018-11-01 08:31:50 UTC
Permalink
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Michal Privoznik
Post by Daniel P. Berrangé
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Marc Hartmayer
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store the result
whether /dev/kvm is accessible or not and we only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index e228f52ec0bb..85516954149b 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -3238,6 +3238,10 @@ struct _virQEMUCapsCachePriv {
virArch hostArch;
unsigned int microcodeVersion;
char *kernelVersion;
+
+ /* cache whether /dev/kvm is usable as runUid:runGuid */
+ virTristateBool kvmUsable;
+ time_t kvmCtime;
};
typedef struct _virQEMUCapsCachePriv virQEMUCapsCachePriv;
typedef virQEMUCapsCachePriv *virQEMUCapsCachePrivPtr;
@@ -3824,6 +3828,52 @@ virQEMUCapsSaveFile(void *data,
}
+/* Determine whether '/dev/kvm' is usable as QEMU user:QEMU group. */
+static bool
+virQEMUCapsKVMUsable(virQEMUCapsCachePrivPtr priv)
+{
+ struct stat sb;
+ static const char *kvm_device = "/dev/kvm";
+ virTristateBool value;
+ virTristateBool cached_value = priv->kvmUsable;
+ time_t kvm_ctime;
+ time_t cached_kvm_ctime = priv->kvmCtime;
+
+ if (stat(kvm_device, &sb) < 0) {
+ virReportSystemError(errno,
+ _("Failed to stat %s"), kvm_device);
+ return false;
+ }
+ kvm_ctime = sb.st_ctime;
This doesn't feel right. /dev/kvm ctime is changed every time qemu is
started or powered off (try running stat over it before and after a
domain is started/shut off). So effectively we will fork more often than
we would think. Should we cache inode number instead? Because for all
that we care is simply if the file is there.
Urgh, that is a bit strange and not the usual semantics for timestamps :-(
Indeed.
Post by Daniel P. Berrangé
We can't stat the inode - the code was explicitly trying to cope with the
way /dev/kvm can change permissions when udev rules get applied. We would
have to compare the user, group, permissions mask and even ACL, or a hash
of those.
Well, we can use ctime as suggested and post a patch for kernel to fix
ctime behaviour. Until the patch is merged our behaviour would be
suboptimal, but still better than it is now.
I guess lets talk to KVM team for their input on this and then decide
what todo.
Hmm, I wonder if it is not actually a kernel problem, but rather something
in userspace genuinely touching the device in a way that caues these
timestamps to be updated.
ioctl(kvm, KVM_CREATE_VM, 0);
KERNEL[3631.129645] change /devices/virtual/misc/kvm (misc)
UDEV [3631.130816] change /devices/virtual/misc/kvm (misc)
and stopping udevd leaves all three times untouched. So it is udev after
all. I just don't know how to find the rule that is causing the issue.
Anyway, as for this patch, I think we can merge it in the end, can't we?
Not really. Udev is in use everywhere, so this behaviour makes the
patch useless in practice, even though it is technically right in
theory :-(
Does it? With this behaviour we still do the "expensive" work after any machine
has started. But for one machine starting it still has the effect of running it
only once. And we *need* to run it for each machine unless we also cache the
result per (at least) user:group of that machine, as every machine can run under
a different user:group.

I'll go through the patch again (just skimmed it the first time), but I think
this actually still makes it better (although not perfect). I wonder why the
udev rule does not fire only on change (why doesn't it say ACTION=="change").
Post by Daniel P. Berrangé
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Martin Kletzander
2018-11-01 09:04:09 UTC
Permalink
Post by Martin Kletzander
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Michal Privoznik
Post by Daniel P. Berrangé
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Marc Hartmayer
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store the result
whether /dev/kvm is accessible or not and we only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index e228f52ec0bb..85516954149b 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -3238,6 +3238,10 @@ struct _virQEMUCapsCachePriv {
virArch hostArch;
unsigned int microcodeVersion;
char *kernelVersion;
+
+ /* cache whether /dev/kvm is usable as runUid:runGuid */
+ virTristateBool kvmUsable;
+ time_t kvmCtime;
};
typedef struct _virQEMUCapsCachePriv virQEMUCapsCachePriv;
typedef virQEMUCapsCachePriv *virQEMUCapsCachePrivPtr;
@@ -3824,6 +3828,52 @@ virQEMUCapsSaveFile(void *data,
}
+/* Determine whether '/dev/kvm' is usable as QEMU user:QEMU group. */
+static bool
+virQEMUCapsKVMUsable(virQEMUCapsCachePrivPtr priv)
+{
+ struct stat sb;
+ static const char *kvm_device = "/dev/kvm";
+ virTristateBool value;
+ virTristateBool cached_value = priv->kvmUsable;
+ time_t kvm_ctime;
+ time_t cached_kvm_ctime = priv->kvmCtime;
+
+ if (stat(kvm_device, &sb) < 0) {
+ virReportSystemError(errno,
+ _("Failed to stat %s"), kvm_device);
+ return false;
+ }
+ kvm_ctime = sb.st_ctime;
This doesn't feel right. /dev/kvm ctime is changed every time qemu is
started or powered off (try running stat over it before and after a
domain is started/shut off). So effectively we will fork more often than
we would think. Should we cache inode number instead? Because for all
that we care is simply if the file is there.
Urgh, that is a bit strange and not the usual semantics for timestamps :-(
Indeed.
Post by Daniel P. Berrangé
We can't stat the inode - the code was explicitly trying to cope with the
way /dev/kvm can change permissions when udev rules get applied. We would
have to compare the user, group, permissions mask and even ACL, or a hash
of those.
Well, we can use ctime as suggested and post a patch for kernel to fix
ctime behaviour. Until the patch is merged our behaviour would be
suboptimal, but still better than it is now.
I guess lets talk to KVM team for their input on this and then decide
what todo.
Hmm, I wonder if it is not actually a kernel problem, but rather something
in userspace genuinely touching the device in a way that caues these
timestamps to be updated.
ioctl(kvm, KVM_CREATE_VM, 0);
KERNEL[3631.129645] change /devices/virtual/misc/kvm (misc)
UDEV [3631.130816] change /devices/virtual/misc/kvm (misc)
and stopping udevd leaves all three times untouched. So it is udev after
all. I just don't know how to find the rule that is causing the issue.
Anyway, as for this patch, I think we can merge it in the end, can't we?
Not really. Udev is in use everywhere, so this behaviour makes the
patch useless in practice, even though it is technically right in
theory :-(
Does it? With this behaviour we still do the "expensive" work after any machine
has started. But for one machine starting it still has the effect of running it
only once. And we *need* to run it for each machine unless we also cache the
result per (at least) user:group of that machine as every machine can run under
different user:group.
I'll go through the patch again (just skimmed it the first time), but I think
this actually still makes it better (although not perfect). I wonder why the
udev rule does not fire only on change (why doesn't it say ACTION=="change").
Of course I meant to write ACTION="add".
Post by Martin Kletzander
Post by Daniel P. Berrangé
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Daniel P. Berrangé
2018-11-01 09:18:03 UTC
Permalink
Post by Martin Kletzander
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Michal Privoznik
ioctl(kvm, KVM_CREATE_VM, 0);
KERNEL[3631.129645] change /devices/virtual/misc/kvm (misc)
UDEV [3631.130816] change /devices/virtual/misc/kvm (misc)
and stopping udevd leaves all three times untouched. So it is udev after
all. I just don't know how to find the rule that is causing the issue.
Anyway, as for this patch, I think we can merge it in the end, can't we?
Not really. Udev is in use everywhere, so this behaviour makes the
patch useless in practice, even though it is technically right in
theory :-(
Does it? With this behaviour we still do the "expensive" work after any machine
has started. But for one machine starting it still has the effect of running it
only once. And we *need* to run it for each machine unless we also cache the
result per (at least) user:group of that machine as every machine can run under
different user:group.
Yes, with the current patch it still rechecks it for each VM startup, but
I was going to suggest that the caching of this simply be put in a global static
variable, as there's no reason to record this device permission state in
the per-VM caps cache.
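
Roughly something like this sketch, with purely illustrative names:

/* Illustrative sketch: keep the cached result in file-scope statics in
 * qemu_capabilities.c instead of in the driver's caps-cache private
 * data. */
static virTristateBool kvmUsableCached = VIR_TRISTATE_BOOL_ABSENT;
static time_t kvmCtimeCached;

static bool
virQEMUCapsKVMUsableGlobal(uid_t runUid, gid_t runGid)
{
    struct stat sb;

    if (stat("/dev/kvm", &sb) < 0)
        return false;

    if (sb.st_ctime != kvmCtimeCached) {
        kvmUsableCached = VIR_TRISTATE_BOOL_ABSENT;
        kvmCtimeCached = sb.st_ctime;
    }

    if (kvmUsableCached == VIR_TRISTATE_BOOL_ABSENT) {
        if (virFileAccessibleAs("/dev/kvm", R_OK | W_OK,
                                runUid, runGid) == 0)
            kvmUsableCached = VIR_TRISTATE_BOOL_YES;
        else
            kvmUsableCached = VIR_TRISTATE_BOOL_NO;
    }

    return kvmUsableCached == VIR_TRISTATE_BOOL_YES;
}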


Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Marc Hartmayer
2018-11-12 11:43:11 UTC
Permalink
Post by Daniel P. Berrangé
Post by Martin Kletzander
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Michal Privoznik
ioctl(kvm, KVM_CREATE_VM, 0);
KERNEL[3631.129645] change /devices/virtual/misc/kvm (misc)
UDEV [3631.130816] change /devices/virtual/misc/kvm (misc)
and stopping udevd leaves all three times untouched. So it is udev after
all. I just don't know how to find the rule that is causing the issue.
Anyway, as for this patch, I think we can merge it in the end, can't we?
Not really. Udev is in use everywhere, so this behaviour makes the
patch useless in practice, even though it is technically right in
theory :-(
Does it? With this behaviour we still do the "expensive" work after any machine
has started. But for one machine starting it still has the effect of running it
only once. And we *need* to run it for each machine unless we also cache the
result per (at least) user:group of that machine as every machine can run under
different user:group.
Yes, with the current patch it still rechecks it for each VM startup
But that happens only because of udev (see Michal’s answer).
Post by Daniel P. Berrangé
, but
I was going to suggest the caching of this is simply put in a global static
variable as there's no reason to record this device permissions state in
the per VM caps cache.
I’m not sure I understand you correctly, but this patch doesn’t use the
VM/QEMU capabilities cache (that would be 'struct _virQEMUCaps') for the
caching. Instead it uses 'struct _virQEMUCapsCachePriv' and this struct
is initialized only once when initializing the QEMU driver. This should
be pretty similar to your proposal with the global static variable.
Post by Daniel P. Berrangé
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
--
Kind regards / Beste Grüße
Marc Hartmayer

IBM Deutschland Research & Development GmbH
Vorsitzende des Aufsichtsrats: Martina Koederitz
Geschäftsführung: Dirk Wittkopp
Sitz der Gesellschaft: Böblingen
Registergericht: Amtsgericht Stuttgart, HRB 243294
Daniel P. Berrangé
2018-11-12 12:36:38 UTC
Permalink
Post by Marc Hartmayer
Post by Daniel P. Berrangé
Post by Martin Kletzander
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Michal Privoznik
ioctl(kvm, KVM_CREATE_VM, 0);
KERNEL[3631.129645] change /devices/virtual/misc/kvm (misc)
UDEV [3631.130816] change /devices/virtual/misc/kvm (misc)
and stopping udevd leaves all three times untouched. So it is udev after
all. I just don't know how to find the rule that is causing the issue.
Anyway, as for this patch, I think we can merge it in the end, can't we?
Not really. Udev is in use everywhere, so this behaviour makes the
patch useless in practice, even though it is technically right in
theory :-(
Does it? With this behaviour we still do the "expensive" work after any machine
has started. But for one machine starting it still has the effect of running it
only once. And we *need* to run it for each machine unless we also cache the
result per (at least) user:group of that machine as every machine can run under
different user:group.
Yes, with the current patch it still rechecks it for each VM startup
But that happens only because of udev (see Michal’s answer).
Post by Daniel P. Berrangé
, but
I was going to suggest the caching of this is simply put in a global static
variable as there's no reason to record this device permissions state in
the per VM caps cache.
I’m not sure I understand you correctly, but this patch doesn’t use the
VM/QEMU capabilities cache (that would be 'struct _virQEMUCaps') for the
caching. Instead it uses 'struct _virQEMUCapsCachePriv' and this struct
is initialized only once when initializing the QEMU driver. This should
be pretty similar to your proposal with the global static variable.
Oh yes, I missed that. I think this is fine then, and I don't think we
need to deal with checking against arbitrary user IDs.


Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Marc Hartmayer
2018-11-12 11:50:41 UTC
Permalink
Post by Martin Kletzander
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Michal Privoznik
Post by Daniel P. Berrangé
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Marc Hartmayer
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store the result
whether /dev/kvm is accessible or not and we only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index e228f52ec0bb..85516954149b 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -3238,6 +3238,10 @@ struct _virQEMUCapsCachePriv {
virArch hostArch;
unsigned int microcodeVersion;
char *kernelVersion;
+
+ /* cache whether /dev/kvm is usable as runUid:runGuid */
+ virTristateBool kvmUsable;
+ time_t kvmCtime;
};
typedef struct _virQEMUCapsCachePriv virQEMUCapsCachePriv;
typedef virQEMUCapsCachePriv *virQEMUCapsCachePrivPtr;
@@ -3824,6 +3828,52 @@ virQEMUCapsSaveFile(void *data,
}
+/* Determine whether '/dev/kvm' is usable as QEMU user:QEMU group. */
+static bool
+virQEMUCapsKVMUsable(virQEMUCapsCachePrivPtr priv)
+{
+ struct stat sb;
+ static const char *kvm_device = "/dev/kvm";
+ virTristateBool value;
+ virTristateBool cached_value = priv->kvmUsable;
+ time_t kvm_ctime;
+ time_t cached_kvm_ctime = priv->kvmCtime;
+
+ if (stat(kvm_device, &sb) < 0) {
+ virReportSystemError(errno,
+ _("Failed to stat %s"), kvm_device);
+ return false;
+ }
+ kvm_ctime = sb.st_ctime;
This doesn't feel right. /dev/kvm ctime is changed every time qemu is
started or powered off (try running stat over it before and after a
domain is started/shut off). So effectively we will fork more often than
we would think. Should we cache inode number instead? Because for all
that we care is simply if the file is there.
Urgh, that is a bit strange and not the usual semantics for timestamps :-(
Indeed.
Post by Daniel P. Berrangé
We can't stat the inode - the code was explicitly trying to cope with the
way /dev/kvm can change permissions when udev rules get applied. We would
have to compare the user, group, permissions mask and even ACL, or a hash
of those.
Well, we can use ctime as suggested and post a patch for kernel to fix
ctime behaviour. Until the patch is merged our behaviour would be
suboptimal, but still better than it is now.
I guess lets talk to KVM team for their input on this and then decide
what todo.
Hmm, I wonder if it is not actually a kernel problem, but rather something
in userspace genuinely touching the device in a way that caues these
timestamps to be updated.
ioctl(kvm, KVM_CREATE_VM, 0);
KERNEL[3631.129645] change /devices/virtual/misc/kvm (misc)
UDEV [3631.130816] change /devices/virtual/misc/kvm (misc)
and stopping udevd leaves all three times untouched. So it is udev after
all. I just don't know how to find the rule that is causing the issue.
Anyway, as for this patch, I think we can merge it in the end, can't we?
Not really. Udev is in use everywhere, so this behaviour makes the
patch useless in practice, even though it is technically right in
theory :-(
Does it? With this behaviour we still do the "expensive" work after any machine
has started. But for one machine starting it still has the effect of running it
only once. And we *need* to run it for each machine unless we also cache the
result per (at least) user:group of that machine as every machine can run under
different user:group.
How can you run a machine/QEMU VM under a different user:group other
than by changing the user:group in qemu.conf and restarting/reloading libvirtd?

As soon as a VM is running we don't have to verify /dev/kvm access, no?
(so there should be no problem when libvirtd tries to “reconnect” to
already running VMs).
Post by Martin Kletzander
I'll go through the patch again (just skimmed it the first time), but I think
this actually still makes it better (although not perfect). I wonder why the
udev rule does not fire only on change (why doesn't it say ACTION=="change").
Post by Daniel P. Berrangé
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
--
Kind regards / Beste Grüße
Marc Hartmayer

IBM Deutschland Research & Development GmbH
Vorsitzende des Aufsichtsrats: Martina Koederitz
Geschäftsführung: Dirk Wittkopp
Sitz der Gesellschaft: Böblingen
Registergericht: Amtsgericht Stuttgart, HRB 243294
Pavel Hrdina
2018-11-12 12:30:44 UTC
Permalink
[...]
Post by Marc Hartmayer
How can you run a machine/QEMU VM under a different user:group other
than changing the user:group in qemu.conf and restart/reload libvirtd?
As soon as a VM is running we have not to verify /dev/kvm access, no?
(so there should be no problem when libvirtd tries to “reconnect” to
already running VMs).
You can add this into your domain XML:

<seclabel type='static' model='dac' relabel='yes'>
<label>phrdina:phrdina</label>
</seclabel>

And it will run the qemu process under that user.

Pavel
Marc Hartmayer
2018-11-12 13:48:09 UTC
Permalink
Post by Pavel Hrdina
[...]
Post by Marc Hartmayer
How can you run a machine/QEMU VM under a different user:group other
than changing the user:group in qemu.conf and restart/reload libvirtd?
As soon as a VM is running we have not to verify /dev/kvm access, no?
(so there should be no problem when libvirtd tries to “reconnect” to
already running VMs).
<seclabel type='static' model='dac' relabel='yes'>
<label>phrdina:phrdina</label>
</seclabel>
And it will run the qemu process under that user.
Interesting :) Actually, if we consider this then the QEMU caps caching
is broken anyway since 'virQEMUCapsNewData' is calling
'virQEMUCapsNewForBinaryInternal(…, priv->runUid, priv->runGid, …)'.

And 'priv->runUid/runGid' is only set once in virQEMUCapsCacheNew.

Maybe I missed something.
Post by Pavel Hrdina
Pavel
--
libvir-list mailing list
https://www.redhat.com/mailman/listinfo/libvir-list
--
Kind regards / Beste Grüße
Marc Hartmayer

IBM Deutschland Research & Development GmbH
Vorsitzende des Aufsichtsrats: Martina Koederitz
Geschäftsführung: Dirk Wittkopp
Sitz der Gesellschaft: Böblingen
Registergericht: Amtsgericht Stuttgart, HRB 243294
Daniel P. Berrangé
2018-11-12 14:13:00 UTC
Permalink
Post by Marc Hartmayer
Post by Pavel Hrdina
[...]
Post by Marc Hartmayer
How can you run a machine/QEMU VM under a different user:group other
than by changing the user:group in qemu.conf and restarting/reloading libvirtd?
As soon as a VM is running we don't have to verify /dev/kvm access, no?
(so there should be no problem when libvirtd tries to “reconnect” to
already running VMs).
<seclabel type='static' model='dac' relabel='yes'>
<label>phrdina:phrdina</label>
</seclabel>
And it will run the qemu process under that user.
Interesting :) Actually, if we consider this then the QEMU caps caching
is broken anyway since 'virQEMUCapsNewData' is calling
'virQEMUCapsNewForBinaryInternal(…, priv->runUid, priv->runGid, …)'.
And 'priv->runUid/runGid' is only set once in virQEMUCapsCacheNew.
Maybe I missed something.
I don't think it is really broken - it merely impacts error reporting.

When running 'virsh capabilities' we are trying to figure out if
KVM is usable on the host. This is always using the default uid/gid
so gives a fairly coarse answer, but the answer is correct for most
common usage.

When building the command-line ARGV for spawning a specific QEMU
instance, the KVM capability merely affects whether libvirt
reports an error about the guest config. In the case where
the capability works with the default uid/gid but breaks with the
custom per-VM uid/gid, libvirt will mistakenly think KVM is
usable and so launch the guest. QEMU will then be unable to
access it and quit with some suitable error message.

So the "brokenness" about not using the per-VM uid/gid merely
delays the error reporting. I don't think this is important
enough to worry about, using the default uid/gid is sufficient.

Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Marc Hartmayer
2018-11-12 14:50:03 UTC
Permalink
Post by Daniel P. Berrangé
Post by Marc Hartmayer
Post by Pavel Hrdina
[...]
Post by Marc Hartmayer
How can you run a machine/QEMU VM under a different user:group other
than by changing the user:group in qemu.conf and restarting/reloading libvirtd?
As soon as a VM is running we don't have to verify /dev/kvm access, no?
(so there should be no problem when libvirtd tries to “reconnect” to
already running VMs).
<seclabel type='static' model='dac' relabel='yes'>
<label>phrdina:phrdina</label>
</seclabel>
And it will run the qemu process under that user.
Interesting :) Actually, if we consider this then the QEMU caps caching
is broken anyway since 'virQEMUCapsNewData' is calling
'virQEMUCapsNewForBinaryInternal(…, priv->runUid, priv->runGid, …)'.
And 'priv->runUid/runGid' is only set once in virQEMUCapsCacheNew.
Maybe I missed something.
I don't think it is really broken - it merely impacts error reporting.
Yep, you’re right. My use of “broken” was clearly an exaggeration.
Post by Daniel P. Berrangé
When running 'virsh capabilities' we are trying to figure out if
KVM is usable on the host. This is always using the default uid/gid
so gives a fairly coarse answer, but the answer is correct for most
common usage.
When building the command-line ARGV for spawning a specific QEMU
instance, the KVM capability merely affects whether libvirt
reports an error about the guest config. In the case where
the capability works with the default uid/gid but breaks with the
custom per-VM uid/gid, libvirt will mistakenly think KVM is
usable and so launch the guest. QEMU will then be unable to
access it and quit with some suitable error message.
So the "brokenness" about not using the per-VM uid/gid merely
delays the error reporting. I don't think this is important
enough to worry about, using the default uid/gid is sufficient.
For this patch as well?
Post by Daniel P. Berrangé
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
--
Kind regards / Beste Grüße
Marc Hartmayer

IBM Deutschland Research & Development GmbH
Vorsitzende des Aufsichtsrats: Martina Koederitz
Geschäftsführung: Dirk Wittkopp
Sitz der Gesellschaft: Böblingen
Registergericht: Amtsgericht Stuttgart, HRB 243294
Michal Privoznik
2018-11-01 15:37:09 UTC
Permalink
Post by Daniel P. Berrangé
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Marc Hartmayer
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store the result
whether /dev/kvm is accessible or not and we only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index e228f52ec0bb..85516954149b 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -3238,6 +3238,10 @@ struct _virQEMUCapsCachePriv {
virArch hostArch;
unsigned int microcodeVersion;
char *kernelVersion;
+
+ /* cache whether /dev/kvm is usable as runUid:runGuid */
+ virTristateBool kvmUsable;
+ time_t kvmCtime;
};
typedef struct _virQEMUCapsCachePriv virQEMUCapsCachePriv;
typedef virQEMUCapsCachePriv *virQEMUCapsCachePrivPtr;
@@ -3824,6 +3828,52 @@ virQEMUCapsSaveFile(void *data,
}
+/* Determine whether '/dev/kvm' is usable as QEMU user:QEMU group. */
+static bool
+virQEMUCapsKVMUsable(virQEMUCapsCachePrivPtr priv)
+{
+ struct stat sb;
+ static const char *kvm_device = "/dev/kvm";
+ virTristateBool value;
+ virTristateBool cached_value = priv->kvmUsable;
+ time_t kvm_ctime;
+ time_t cached_kvm_ctime = priv->kvmCtime;
+
+ if (stat(kvm_device, &sb) < 0) {
+ virReportSystemError(errno,
+ _("Failed to stat %s"), kvm_device);
+ return false;
+ }
+ kvm_ctime = sb.st_ctime;
This doesn't feel right. /dev/kvm ctime is changed every time qemu is
started or powered off (try running stat over it before and after a
domain is started/shut off). So effectively we will fork more often than
we would think. Should we cache the inode number instead? Because all
we care about is simply whether the file is there.
Urgh, that is a bit strange and not the usual semantics for timestamps :-(
Indeed.
Post by Daniel P. Berrangé
We can't stat the inode - the code was explicitly trying to cope with the
way /dev/kvm can change permissions when udev rules get applied. We would
have to compare the user, group, permissions mask and even ACL, or a hash
of those.
Well, we can use ctime as suggested and post a patch for kernel to fix
ctime behaviour. Until the patch is merged our behaviour would be
suboptimal, but still better than it is now.
I guess let's talk to the KVM team for their input on this and then decide
what to do.
Hmm, I wonder if it is not actually a kernel problem, but rather something
in userspace genuinely touching the device in a way that causes these
timestamps to be updated.
eg I vaguely recall a udev rule that resets permissions on device nodes
whenever an FD is closed, which might cause this kind of behaviour
After trying hard to find the rule that mangles the ctime I've found the
following bug in udev:

https://github.com/zippy2/systemd/commit/51915e4e6a64c581da321c25448d80626d8e8408?diff=unified

I've sent that as a pull request to systemd. So after all, ctime might
be usable again.

Michal
Marc Hartmayer
2018-11-20 10:00:32 UTC
Permalink
Post by Michal Privoznik
Post by Daniel P. Berrangé
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Daniel P. Berrangé
Post by Michal Privoznik
Post by Marc Hartmayer
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store the result
whether /dev/kvm is accessible or not and we only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index e228f52ec0bb..85516954149b 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -3238,6 +3238,10 @@ struct _virQEMUCapsCachePriv {
virArch hostArch;
unsigned int microcodeVersion;
char *kernelVersion;
+
+ /* cache whether /dev/kvm is usable as runUid:runGuid */
+ virTristateBool kvmUsable;
+ time_t kvmCtime;
};
typedef struct _virQEMUCapsCachePriv virQEMUCapsCachePriv;
typedef virQEMUCapsCachePriv *virQEMUCapsCachePrivPtr;
@@ -3824,6 +3828,52 @@ virQEMUCapsSaveFile(void *data,
}
+/* Determine whether '/dev/kvm' is usable as QEMU user:QEMU group. */
+static bool
+virQEMUCapsKVMUsable(virQEMUCapsCachePrivPtr priv)
+{
+ struct stat sb;
+ static const char *kvm_device = "/dev/kvm";
+ virTristateBool value;
+ virTristateBool cached_value = priv->kvmUsable;
+ time_t kvm_ctime;
+ time_t cached_kvm_ctime = priv->kvmCtime;
+
+ if (stat(kvm_device, &sb) < 0) {
+ virReportSystemError(errno,
+ _("Failed to stat %s"), kvm_device);
+ return false;
+ }
+ kvm_ctime = sb.st_ctime;
This doesn't feel right. /dev/kvm ctime is changed every time qemu is
started or powered off (try running stat over it before and after a
domain is started/shut off). So effectively we will fork more often than
we would think. Should we cache the inode number instead? Because all
we care about is simply whether the file is there.
Urgh, that is a bit strange and not the usual semantics for timestamps :-(
Indeed.
Post by Daniel P. Berrangé
We can't stat the inode - the code was explicitly trying to cope with the
way /dev/kvm can change permissions when udev rules get applied. We would
have to compare the user, group, permissions mask and even ACL, or a hash
of those.
Well, we can use ctime as suggested and post a patch for kernel to fix
ctime behaviour. Until the patch is merged our behaviour would be
suboptimal, but still better than it is now.
I guess let's talk to the KVM team for their input on this and then decide
what to do.
Hmm, I wonder if it is not actually a kernel problem, but rather something
in userspace genuinely touching the device in a way that causes these
timestamps to be updated.
eg I vaguely recall a udev rule that resets permissions on device nodes
whenever an FD is closed, which might cause this kind of behaviour
After trying hard to find the rule that mangles the ctime I've found the
https://github.com/zippy2/systemd/commit/51915e4e6a64c581da321c25448d80626d8e8408?diff=unified
I've sent that as a pull request to systemd. So after all, ctime might
be usable again.
Are there any other objections to this patch? Even with the
ctime bug in udev it performs _much_ better than before.
Post by Michal Privoznik
Michal
--
Kind regards / Beste Grüße
Marc Hartmayer

IBM Deutschland Research & Development GmbH
Vorsitzende des Aufsichtsrats: Martina Koederitz
Geschäftsführung: Dirk Wittkopp
Sitz der Gesellschaft: Böblingen
Registergericht: Amtsgericht Stuttgart, HRB 243294

Martin Kletzander
2018-11-01 09:03:45 UTC
Permalink
Post by Marc Hartmayer
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store the result
whether /dev/kvm is accessible or not and we only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)
diff --git a/src/qemu/qemu_capabilities.c b/src/qemu/qemu_capabilities.c
index e228f52ec0bb..85516954149b 100644
--- a/src/qemu/qemu_capabilities.c
+++ b/src/qemu/qemu_capabilities.c
@@ -3238,6 +3238,10 @@ struct _virQEMUCapsCachePriv {
virArch hostArch;
unsigned int microcodeVersion;
char *kernelVersion;
+
+ /* cache whether /dev/kvm is usable as runUid:runGuid */
Even though this doesn't solve all of it (the ctime change with udev etc.), it
makes things better. However, this needs to be checked (or cached) per seclabel of
the domain, as it can run under a non-default user.
Post by Marc Hartmayer
+ virTristateBool kvmUsable;
+ time_t kvmCtime;
};
typedef struct _virQEMUCapsCachePriv virQEMUCapsCachePriv;
typedef virQEMUCapsCachePriv *virQEMUCapsCachePrivPtr;
@@ -3824,6 +3828,52 @@ virQEMUCapsSaveFile(void *data,
}
+/* Determine whether '/dev/kvm' is usable as QEMU user:QEMU group. */
+static bool
+virQEMUCapsKVMUsable(virQEMUCapsCachePrivPtr priv)
+{
+ struct stat sb;
+ static const char *kvm_device = "/dev/kvm";
+ virTristateBool value;
+ virTristateBool cached_value = priv->kvmUsable;
+ time_t kvm_ctime;
+ time_t cached_kvm_ctime = priv->kvmCtime;
+
+ if (stat(kvm_device, &sb) < 0) {
+ virReportSystemError(errno,
+ _("Failed to stat %s"), kvm_device);
+ return false;
+ }
+ kvm_ctime = sb.st_ctime;
+
+ if (kvm_ctime != cached_kvm_ctime) {
+ VIR_DEBUG("%s has changed (%lld vs %lld)", kvm_device,
+ (long long)kvm_ctime, (long long)cached_kvm_ctime);
+ cached_value = VIR_TRISTATE_BOOL_ABSENT;
+ }
+
+ if (cached_value != VIR_TRISTATE_BOOL_ABSENT)
+ return cached_value == VIR_TRISTATE_BOOL_YES;
+
+ if (virFileAccessibleAs(kvm_device, R_OK | W_OK,
+ priv->runUid, priv->runGid) == 0) {
+ value = VIR_TRISTATE_BOOL_YES;
+ } else {
+ value = VIR_TRISTATE_BOOL_NO;
+ }
+
+ /* There is a race window between 'stat' and
+ * 'virFileAccessibleAs'. However, since we're only interested in
+ * detecting changes *after* the virFileAccessibleAs check, we can
+ * neglect this here.
+ */
+ priv->kvmCtime = kvm_ctime;
+ priv->kvmUsable = value;
+
+ return value == VIR_TRISTATE_BOOL_YES;
+}
+
+
static bool
virQEMUCapsIsValid(void *data,
void *privData)
@@ -3872,8 +3922,7 @@ virQEMUCapsIsValid(void *data,
return true;
}
- kvmUsable = virFileAccessibleAs("/dev/kvm", R_OK | W_OK,
- priv->runUid, priv->runGid) == 0;
+ kvmUsable = virQEMUCapsKVMUsable(priv);
if (!virQEMUCapsGet(qemuCaps, QEMU_CAPS_KVM) &&
kvmUsable) {
@@ -4684,6 +4733,7 @@ virQEMUCapsCacheNew(const char *libDir,
priv->runUid = runUid;
priv->runGid = runGid;
priv->microcodeVersion = microcodeVersion;
+ priv->kvmUsable = VIR_TRISTATE_BOOL_ABSENT;
if (uname(&uts) == 0 &&
virAsprintf(&priv->kernelVersion, "%s %s", uts.release, uts.version) < 0)
--
2.17.0
--
libvir-list mailing list
https://www.redhat.com/mailman/listinfo/libvir-list
Daniel P. Berrangé
2018-11-01 13:37:47 UTC
Permalink
Post by Marc Hartmayer
Introduce caching whether /dev/kvm is usable as the QEMU user:QEMU
group. This reduces the overhead of the QEMU capabilities cache
lookup. Before this patch there were many fork() calls used for
checking whether /dev/kvm is accessible. Now we store the result
whether /dev/kvm is accessible or not and we only need to re-run the
virFileAccessibleAs check if the ctime of /dev/kvm has changed.
---
src/qemu/qemu_capabilities.c | 54 ++++++++++++++++++++++++++++++++++--
1 file changed, 52 insertions(+), 2 deletions(-)
For the benefit of those who were not at KVM Forum last week, this patch
addresses the performance problem described starting here:




Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|