From ab794fa7413d1650c1ba01ea18e74964877fdf99 Mon Sep 17 00:00:00 2001
From: Todd Gamblin <tgamblin@llnl.gov>
Date: Sat, 30 Jun 2018 22:05:50 -0700
Subject: [PATCH] locks: llnl.util.lock now only writes host info when in debug
 mode

- write locks previously wrote information about the lock holder (host
  and pid), and read locks woudl read this in.

- This is really only for debugging, so only enable it then

- add some tests that target debug info, and improve multiproc lock test
  output
---
 lib/spack/llnl/util/lock.py            | 27 ++++++---
 lib/spack/spack/test/llnl/util/lock.py | 84 ++++++++++++++++++++++++--
 2 files changed, 98 insertions(+), 13 deletions(-)

diff --git a/lib/spack/llnl/util/lock.py b/lib/spack/llnl/util/lock.py
index 0d072418cc..794b201668 100644
--- a/lib/spack/llnl/util/lock.py
+++ b/lib/spack/llnl/util/lock.py
@@ -51,7 +51,7 @@ class Lock(object):
     is enabled) and recent NFS versions.
     """
 
-    def __init__(self, path, start=0, length=0):
+    def __init__(self, path, start=0, length=0, debug=False):
         """Construct a new lock on the file at ``path``.
 
         By default, the lock applies to the whole file.  Optionally,
@@ -72,7 +72,10 @@ def __init__(self, path, start=0, length=0):
         self._start = start
         self._length = length
 
-        # PID and host of lock holder
+        # enable debug mode
+        self.debug = debug
+
+        # PID and host of lock holder (only used in debug mode)
         self.pid = self.old_pid = None
         self.host = self.old_host = None
 
@@ -118,12 +121,14 @@ def _lock(self, op, timeout=_default_timeout):
                 fcntl.lockf(self._file, op | fcntl.LOCK_NB,
                             self._length, self._start, os.SEEK_SET)
 
-                # All locks read the owner PID and host
-                self._read_lock_data()
+                # help for debugging distributed locking
+                if self.debug:
+                    # All locks read the owner PID and host
+                    self._read_debug_data()
 
-                # Exclusive locks write their PID/host
-                if op == fcntl.LOCK_EX:
-                    self._write_lock_data()
+                    # Exclusive locks write their PID/host
+                    if op == fcntl.LOCK_EX:
+                        self._write_debug_data()
 
                 return
 
@@ -148,15 +153,19 @@ def _ensure_parent_directory(self):
                     e.errno == errno.EISDIR):
                 raise
 
-    def _read_lock_data(self):
+    def _read_debug_data(self):
         """Read PID and host data out of the file if it is there."""
+        self.old_pid = self.pid
+        self.old_host = self.host
+
         line = self._file.read()
         if line:
             pid, host = line.strip().split(',')
             _, _, self.pid = pid.rpartition('=')
             _, _, self.host = host.rpartition('=')
+            self.pid = int(self.pid)
 
-    def _write_lock_data(self):
+    def _write_debug_data(self):
         """Write PID and host data to the file, recording old values."""
         self.old_pid = self.pid
         self.old_host = self.host
diff --git a/lib/spack/spack/test/llnl/util/lock.py b/lib/spack/spack/test/llnl/util/lock.py
index 7c38a2547f..5127462c58 100644
--- a/lib/spack/spack/test/llnl/util/lock.py
+++ b/lib/spack/spack/test/llnl/util/lock.py
@@ -62,13 +62,14 @@
 
 """
 import os
+import socket
 import shutil
 import tempfile
 import traceback
 import glob
 import getpass
 from contextlib import contextmanager
-from multiprocessing import Process
+from multiprocessing import Process, Queue
 
 import pytest
 
@@ -201,17 +202,21 @@ def lock_path(lock_dir):
         os.unlink(lock_file)
 
 
-def local_multiproc_test(*functions):
+def local_multiproc_test(*functions, **kwargs):
     """Order some processes using simple barrier synchronization."""
     b = mp.Barrier(len(functions), timeout=barrier_timeout)
-    procs = [Process(target=f, args=(b,)) for f in functions]
+
+    args = (b,) + tuple(kwargs.get('extra_args', ()))
+    procs = [Process(target=f, args=args, name=f.__name__)
+             for f in functions]
 
     for p in procs:
         p.start()
 
     for p in procs:
         p.join()
-        assert p.exitcode == 0
+
+    assert all(p.exitcode == 0 for p in procs)
 
 
 def mpi_multiproc_test(*functions):
@@ -907,3 +912,74 @@ def do_write_with_exception(exit_fn):
     assert vals['exception']
     assert not vals['exited_fn']
     assert not vals['exception_fn']
+
+
+def test_lock_debug_output(lock_path):
+    host = socket.getfqdn()
+
+    def p1(barrier, q1, q2):
+        # exchange pids
+        p1_pid = os.getpid()
+        q1.put(p1_pid)
+        p2_pid = q2.get()
+
+        # set up lock
+        lock = lk.Lock(lock_path, debug=True)
+
+        with lk.WriteTransaction(lock):
+            # p1 takes write lock and writes pid/host to file
+            barrier.wait()  # ------------------------------------ 1
+
+        assert lock.pid == p1_pid
+        assert lock.host == host
+
+        # wait for p2 to verify contents of file
+        barrier.wait()  # ---------------------------------------- 2
+
+        # wait for p2 to take a write lock
+        barrier.wait()  # ---------------------------------------- 3
+
+        # verify pid/host info again
+        with lk.ReadTransaction(lock):
+            assert lock.old_pid == p1_pid
+            assert lock.old_host == host
+
+            assert lock.pid == p2_pid
+            assert lock.host == host
+
+        barrier.wait()  # ---------------------------------------- 4
+
+    def p2(barrier, q1, q2):
+        # exchange pids
+        p2_pid = os.getpid()
+        p1_pid = q1.get()
+        q2.put(p2_pid)
+
+        # set up lock
+        lock = lk.Lock(lock_path, debug=True)
+
+        # p1 takes write lock and writes pid/host to file
+        barrier.wait()  # ---------------------------------------- 1
+
+        # verify that p1 wrote information to lock file
+        with lk.ReadTransaction(lock):
+            assert lock.pid == p1_pid
+            assert lock.host == host
+
+        barrier.wait()  # ---------------------------------------- 2
+
+        # take a write lock on the file and verify pid/host info
+        with lk.WriteTransaction(lock):
+            assert lock.old_pid == p1_pid
+            assert lock.old_host == host
+
+            assert lock.pid == p2_pid
+            assert lock.host == host
+
+            barrier.wait()  # ------------------------------------ 3
+
+        # wait for p1 to verify pid/host info
+        barrier.wait()  # ---------------------------------------- 4
+
+    q1, q2 = Queue(), Queue()
+    local_multiproc_test(p2, p1, extra_args=(q1, q2))
-- 
GitLab