Discussion:
[libvirt] [PATCH] On Xen, domains may be destroyed without libvirt being involved
Jim Fehlig
2018-12-10 18:01:45 UTC
Permalink
This can happen via `xl destroy`, for example.  When this happens,
libvirt is stuck in an inconsistent state: libvirt believes the domain
is still running, but attempts to use libvirt’s APIs to shutdown the
domain fail.  The only way out of this situation is to restart libvirt.
Marek asked about this last Friday

https://www.redhat.com/archives/libvir-list/2018-December/msg00196.html

and then spent the day creating a patch :-)

https://www.redhat.com/archives/libvir-list/2018-December/msg00212.html

I'll review/test his patches today, but thank you for taking a stab at this problem!

Regards,
Jim
To prevent this from happening, process LIBXL_EVENT_TYPE_DOMAIN_DEATH as
well as LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN, but only if libvirt has not
already begun to destroy the domain.
---
 src/conf/domain_conf.h   |  4 ++++
 src/libxl/libxl_domain.c | 24 +++++++++++++++++++-----
 2 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index b24e6ec3de..d3520bde15 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -2620,6 +2620,10 @@ struct _virDomainObj {
     unsigned int updated : 1;
     unsigned int removing : 1;
+    /* Only used by the Xen backend */
+    unsigned int being_destroyed_by_libvirt : 1;
+    unsigned int already_destroyed : 1;
+
     virDomainDefPtr def; /* The current definition */
     virDomainDefPtr newDef; /* New definition to activate at shutdown */
diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c
index 5fe3f44fbe..680e5f209f 100644
--- a/src/libxl/libxl_domain.c
+++ b/src/libxl/libxl_domain.c
@@ -482,9 +482,21 @@ libxlDomainShutdownThread(void *opaque)
         goto cleanup;
     }
+    VIR_INFO("Domain %d died", event->domid);
+
     if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0)
         goto cleanup;
+    if (LIBXL_EVENT_TYPE_DOMAIN_DEATH == ev->type) {
+        if (vm->being_destroyed_by_libvirt) {
+            VIR_INFO("VM %d already being destroyed by libvirt",
event->domid);
+            goto cleanup;
+        }
+
+        VIR_INFO("Marking VM %d as already destroyed", event->domid);
+        vm->already_destroyed = true;
+    }
+
     if (xl_reason == LIBXL_SHUTDOWN_REASON_POWEROFF) {
         virDomainObjSetState(vm, VIR_DOMAIN_SHUTOFF,
                              VIR_DOMAIN_SHUTOFF_SHUTDOWN);
@@ -620,7 +632,8 @@ libxlDomainEventHandler(void *data,
VIR_LIBXL_EVENT_CONST libxl_event *event)
     virThread thread;
     libxlDriverConfigPtr cfg;
-    if (event->type != LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN) {
+    if (LIBXL_EVENT_TYPE_DOMAIN_DEATH != event->type &&
+        LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN != event->type) {
         VIR_INFO("Unhandled event type %d", event->type);
         goto error;
     }
@@ -629,18 +642,16 @@ libxlDomainEventHandler(void *data,
VIR_LIBXL_EVENT_CONST libxl_event *event)
      * Start a thread to handle shutdown.  We don't want to be tying up
      * libxl's event machinery by doing a potentially lengthy shutdown.
      */
-    if (VIR_ALLOC(shutdown_info) < 0)
-        goto error;
+    while (VIR_ALLOC(shutdown_info) < 0) {}
     shutdown_info->driver = driver;
     shutdown_info->event = (libxl_event *)event;
-    if (virThreadCreate(&thread, false, libxlDomainShutdownThread,
+    while (virThreadCreate(&thread, false, libxlDomainShutdownThread,
                         shutdown_info) < 0) {
         /*
          * Not much we can do on error here except log it.
          */
         VIR_ERROR(_("Failed to create thread to handle domain shutdown"));
-        goto error;
     }
     /*
@@ -752,6 +763,9 @@ libxlDomainDestroyInternal(libxlDriverPrivatePtr driver,
 {
     libxlDriverConfigPtr cfg = libxlDriverConfigGet(driver);
     int ret = -1;
+    if (vm->already_destroyed)
+        return -1;
+    vm->being_destroyed_by_libvirt = true;
     /* Unlock virDomainObj during destroy, which can take considerable
      * time on large memory domains.
--
libvir-list mailing list
https://www.redhat.com/mailman/listinfo/libvir-list
Peter Krempa
2018-12-10 13:42:54 UTC
Permalink
This can happen via `xl destroy`, for example.  When this happens,
libvirt is stuck in an inconsistent state: libvirt believes the domain
is still running, but attempts to use libvirt’s APIs to shutdown the
domain fail.  The only way out of this situation is to restart libvirt.
To prevent this from happening, process LIBXL_EVENT_TYPE_DOMAIN_DEATH as
well as LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN, but only if libvirt has not
already begun to destroy the domain.
---
 src/conf/domain_conf.h   |  4 ++++
 src/libxl/libxl_domain.c | 24 +++++++++++++++++++-----
 2 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index b24e6ec3de..d3520bde15 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -2620,6 +2620,10 @@ struct _virDomainObj {
     unsigned int updated : 1;
     unsigned int removing : 1;
 
+    /* Only used by the Xen backend */
+    unsigned int being_destroyed_by_libvirt : 1;
+    unsigned int already_destroyed : 1;
Please put this into 'struct _libxlDomainObjPrivate' if it's going to be
only used by the libxl driver.

Also the 'bool' type should be used.
+
     virDomainDefPtr def; /* The current definition */
     virDomainDefPtr newDef; /* New definition to activate at shutdown */
 
Daniel P. Berrangé
2018-12-10 13:45:39 UTC
Permalink
This can happen via `xl destroy`, for example.  When this happens,
libvirt is stuck in an inconsistent state: libvirt believes the domain
is still running, but attempts to use libvirt’s APIs to shutdown the
domain fail.  The only way out of this situation is to restart libvirt.
To prevent this from happening, process LIBXL_EVENT_TYPE_DOMAIN_DEATH as
well as LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN, but only if libvirt has not
already begun to destroy the domain.
---
 src/conf/domain_conf.h   |  4 ++++
 src/libxl/libxl_domain.c | 24 +++++++++++++++++++-----
 2 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index b24e6ec3de..d3520bde15 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -2620,6 +2620,10 @@ struct _virDomainObj {
     unsigned int updated : 1;
     unsigned int removing : 1;
 
+    /* Only used by the Xen backend */
+    unsigned int being_destroyed_by_libvirt : 1;
+    unsigned int already_destroyed : 1;
This ought to go in the _libxlDomainObjPrivate struct instead
diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c
index 5fe3f44fbe..680e5f209f 100644
--- a/src/libxl/libxl_domain.c
+++ b/src/libxl/libxl_domain.c
@@ -482,9 +482,21 @@ libxlDomainShutdownThread(void *opaque)
         goto cleanup;
     }
 
+    VIR_INFO("Domain %d died", event->domid);
+
     if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0)
         goto cleanup;
 
+    if (LIBXL_EVENT_TYPE_DOMAIN_DEATH == ev->type) {
Normal style is to have variable name first, constant second
+        if (vm->being_destroyed_by_libvirt) {
+            VIR_INFO("VM %d already being destroyed by libvirt",
event->domid);
The patch got mangled by your mail client i'm afraid.
+            goto cleanup;
+        }
+
+        VIR_INFO("Marking VM %d as already destroyed", event->domid);
+        vm->already_destroyed = true;
Using 'true' for an "unsigned int :1' bitfield is not valid.
You would nede to declare the variable as a bool, or use an
integer value here.
+    }
+
     if (xl_reason == LIBXL_SHUTDOWN_REASON_POWEROFF) {
         virDomainObjSetState(vm, VIR_DOMAIN_SHUTOFF,
                              VIR_DOMAIN_SHUTOFF_SHUTDOWN);
@@ -620,7 +632,8 @@ libxlDomainEventHandler(void *data,
VIR_LIBXL_EVENT_CONST libxl_event *event)
     virThread thread;
     libxlDriverConfigPtr cfg;
 
-    if (event->type != LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN) {
+    if (LIBXL_EVENT_TYPE_DOMAIN_DEATH != event->type &&
+        LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN != event->type) {
Again, this is contrary to normal libvirt style, so please put it
back to variable name first.
         VIR_INFO("Unhandled event type %d", event->type);
         goto error;
     }
@@ -629,18 +642,16 @@ libxlDomainEventHandler(void *data,
VIR_LIBXL_EVENT_CONST libxl_event *event)
      * Start a thread to handle shutdown.  We don't want to be tying up
      * libxl's event machinery by doing a potentially lengthy shutdown.
      */
-    if (VIR_ALLOC(shutdown_info) < 0)
-        goto error;
+    while (VIR_ALLOC(shutdown_info) < 0) {}
Huh ? This is busy-waiting when getting OOM which is definitely
not what we want.
 
     shutdown_info->driver = driver;
     shutdown_info->event = (libxl_event *)event;
-    if (virThreadCreate(&thread, false, libxlDomainShutdownThread,
+    while (virThreadCreate(&thread, false, libxlDomainShutdownThread,
                         shutdown_info) < 0) {
Again, busy-waiting when failin to spawn threads.
         /*
          * Not much we can do on error here except log it.
          */
         VIR_ERROR(_("Failed to create thread to handle domain shutdown"));
-        goto error;
     }
 
     /*
@@ -752,6 +763,9 @@ libxlDomainDestroyInternal(libxlDriverPrivatePtr driver,
 {
     libxlDriverConfigPtr cfg = libxlDriverConfigGet(driver);
     int ret = -1;
+    if (vm->already_destroyed)
+        return -1;
+    vm->being_destroyed_by_libvirt = true;
Again, using a bool "true" with an unsigned int bitfied.
 
     /* Unlock virDomainObj during destroy, which can take considerable
      * time on large memory domains.
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
Loading...