pax_global_header00006660000000000000000000000064137142761220014517gustar00rootroot0000000000000052 comment=01b727adf1ce3c0ed5b95299d60f62bc66d98eb5 sanlock-3.8.2/000077500000000000000000000000001371427612200131635ustar00rootroot00000000000000sanlock-3.8.2/.gitattributes000066400000000000000000000001251371427612200160540ustar00rootroot00000000000000# Set the default behavior, in case people don't have core.autocrlf set. * text=auto sanlock-3.8.2/.gitignore000066400000000000000000000006051371427612200151540ustar00rootroot00000000000000*.co *.swp *.tar.gz .tox/ .vimdir cscope.* python/*.so python/build/ python/usr/ reset/sanlk-reset reset/sanlk-resetd sanlock.spec src/libsanlock.pc src/libsanlock.so* src/libsanlock_client.pc src/libsanlock_client.so* src/sanlock tests/*.pyc tests/__pycache__/ tests/devcount tests/killpath tests/sanlk_client tests/sanlk_load tests/sanlk_path wdmd/libwdmd.so* wdmd/wdmd wdmd/wdmd_client sanlock-3.8.2/.travis.yml000066400000000000000000000005121371427612200152720ustar00rootroot00000000000000dist: xenial language: python python: - "3.6" - "3.7" - "3.8" - "3.9-dev" addons: apt: packages: - gcc - libaio-dev - libblkid-dev - make install: - pip install flake8 script: - make BUILDARGS="--build-lib=." - source tests/env.sh - pytest - flake8 --statistics tests python sanlock-3.8.2/.vimrc000066400000000000000000000001251371427612200143020ustar00rootroot00000000000000" Local vim configuration for sanlock set noexpandtab set shiftwidth=8 set tabstop=8 sanlock-3.8.2/Makefile000066400000000000000000000015621371427612200146270ustar00rootroot00000000000000version := $(shell cat VERSION) ifeq ($(shell git describe --exact-match 2>/dev/null),) # sanlock-3.7.0-5-g11fb098 -> 5.g11fb098 release := $(shell git describe --tags | awk -F- '{print $$(NF-1) "." 
$$(NF)}') else release := 0 endif distname := sanlock-$(version) tarball := $(distname).tar.gz SUBDIRS = wdmd src python reset .PHONY: all $(SUBDIRS) clean install all: $(SUBDIRS) $(SUBDIRS): $(MAKE) -C $@ src: wdmd python reset: src clean install: for dir in $(SUBDIRS); do \ $(MAKE) -C $$dir $@; \ done dist: spec rm -f $(tarball) git archive --prefix=$(distname)/ HEAD > $(distname).tar tar rf $(distname).tar --transform="s|^|$(distname)/&|" sanlock.spec gzip $(distname).tar srpm: dist rpmbuild -ts $(tarball) rpm: dist rpmbuild -ta $(tarball) spec: sed -e 's/@VERSION@/$(version)/g' \ -e 's/@RELEASE@/$(release)/g' \ sanlock.spec.in > sanlock.spec sanlock-3.8.2/README.dev000066400000000000000000000031701371427612200146210ustar00rootroot00000000000000Prerequisites ============= For Fedora/CentOS install the following packages: $ sudo yum install -y gcc make libaio-devel libblkid-devel For Ubuntu/Debian install the following packages: $ sudo apt install -y gcc make libaio-dev libblkid-dev How to test sanlock =================== To run the python based tests, you need tox. The best way to install a recent version is to use pip: $ pip install tox To run the tests with python 2.7 and 3.6: $ tox Note: python 3.6 tests will fail now, since sanlock extension module needs changes to compile on python 3. To run only python 2.7: $ tox -e py27 To run only test from some modules: $ tox tests/daemon_test.py To run only tests matching the substring "foo": $ tox -- -k foo To run basic lint and style check: $ tox -e flake8 Sometimes when debugging failing tests verbose mode is useful. 
You can enable it using: $ tox -e py36 tests/python_test.py -- -vv Or using environment variable: export PYTEST_ADDOPTS=-vv Testing 4K support ================== To enable the 4k tests, you need to setup 4k storage for the tests: $ python tests/storage.py setup This creates two loop devices with 4k sector size; one for testing sanlock with 4k block device, and the other for testing with a filesystem backed by a 4k block device. To teardown the storage: $ python tests/storage.py teardown The script unmounts the filesystem and detaches the loop devices. The storage helper script uses sudo to perform privileged operations. The best way to use it is to setup the environment once at the start of the session, and teardown when you finish. sanlock-3.8.2/README.license000066400000000000000000000006021371427612200154620ustar00rootroot00000000000000LGPLv2+ src/libsanlock_client.so src/sanlock.h src/sanlock_rv.h src/sanlock_admin.h src/sanlock_resource.h src/sanlock_sock.h src/sanlock_sock.c src/client.c wdmd/libwdmd.so wdmd/wdmd.h wdmd/wdmd_sock.h wdmd/wdmd_sock.c wdmd/client.c GPLv2 src/list.h (copied from linux kernel) GPLv2+ src/crc32c.c (copied from btrfs-progs which copied from linux kernel) all other original files sanlock-3.8.2/README.rst000066400000000000000000001513471371427612200146650ustar00rootroot00000000000000See https://pagure.io/sanlock Mailing list https://lists.fedorahosted.org/admin/lists/sanlock-devel.lists.fedorahosted.org/ From sanlock(8) at sanlock.git/src/sanlock.8 :: SANLOCK(8) System Manager's Manual SANLOCK(8) NAME sanlock - shared storage lock manager SYNOPSIS sanlock [COMMAND] [ACTION] ... DESCRIPTION sanlock is a lock manager built on shared storage. Hosts with access to the storage can perform locking. An application running on the hosts is given a small amount of space on the shared block device or file, and uses sanlock for its own application-specific synchroniza‐ tion.
Internally, the sanlock daemon manages locks using two disk- based lease algorithms: delta leases and paxos leases. · delta leases are slow to acquire and demand regular i/o to shared storage. sanlock only uses them internally to hold a lease on its "host_id" (an integer host identifier from 1-2000). They prevent two hosts from using the same host identifier. The delta lease renewals also indicate if a host is alive. ("Light-Weight Leases for Storage- Centric Coordination", Chockler and Malkhi.) · paxos leases are fast to acquire and sanlock makes them available to applications as general purpose resource leases. The disk paxos algorithm uses host_id's internally to represent different hosts, and the owner of a paxos lease. delta leases provide unique host_id's for implementing paxos leases, and delta lease renewals serve as a proxy for paxos lease renewal. ("Disk Paxos", Eli Gafni and Leslie Lamport.) Externally, the sanlock daemon exposes a locking interface through lib‐ sanlock in terms of "lockspaces" and "resources". A lockspace is a locking context that an application creates for itself on shared stor‐ age. When the application on each host is started, it "joins" the lockspace. It can then create "resources" on the shared storage. Each resource represents an application-specific entity. The application can acquire and release leases on resources. To use sanlock from an application: · Allocate shared storage for an application, e.g. a shared LUN or LV from a SAN, or files from NFS. · Provide the storage to the application. · The application uses this storage with libsanlock to create a lockspace and resources for itself. · The application joins the lockspace when it starts. · The application acquires and releases leases on resources. How lockspaces and resources translate to delta leases and paxos leases within sanlock: Lockspaces · A lockspace is based on delta leases held by each host using the lockspace. 
· A lockspace is a series of 2000 delta leases on disk, and requires 1MB of storage. (See Storage below for size variations.) · A lockspace can support up to 2000 concurrent hosts using it, each using a different delta lease. · Applications can i) create, ii) join and iii) leave a lockspace, which corresponds to i) initializing the set of delta leases on disk, ii) acquiring one of the delta leases and iii) releasing the delta lease. · When a lockspace is created, a unique lockspace name and disk loca‐ tion is provided by the application. · When a lockspace is created/initialized, sanlock formats the sequence of 2000 on-disk delta lease structures on the file or disk, e.g. /mnt/leasefile (NFS) or /dev/vg/lv (SAN). · The 2000 individual delta leases in a lockspace are identified by number: 1,2,3,...,2000. · Each delta lease is a 512 byte sector in the 1MB lockspace, offset by its number, e.g. delta lease 1 is offset 0, delta lease 2 is offset 512, delta lease 2000 is offset 1023488. (See Storage below for size variations.) · When an application joins a lockspace, it must specify the lockspace name, the lockspace location on shared disk/file, and the local host's host_id. sanlock then acquires the delta lease corresponding to the host_id, e.g. joining the lockspace with host_id 1 acquires delta lease 1. · The terms delta lease, lockspace lease, and host_id lease are used interchangeably. · sanlock acquires a delta lease by writing the host's unique name to the delta lease disk sector, reading it back after a delay, and veri‐ fying it is the same. · If a unique host name is not specified, sanlock generates a uuid to use as the host's name. The delta lease algorithm depends on hosts using unique names. · The application on each host should be configured with a unique host_id, where the host_id is an integer 1-2000.
· If hosts are misconfigured and have the same host_id, the delta lease algorithm is designed to detect this conflict, and only one host will be able to acquire the delta lease for that host_id. · A delta lease ensures that a lockspace host_id is being used by a single host with the unique name specified in the delta lease. · Resolving delta lease conflicts is slow, because the algorithm is based on waiting and watching for some time for other hosts to write to the same delta lease sector. If multiple hosts try to use the same delta lease, the delay is increased substantially. So, it is best to configure applications to use unique host_id's that will not conflict. · After sanlock acquires a delta lease, the lease must be renewed until the application leaves the lockspace (which corresponds to releasing the delta lease on the host_id.) · sanlock renews delta leases every 20 seconds (by default) by writing a new timestamp into the delta lease sector. · When a host acquires a delta lease in a lockspace, it can be referred to as "joining" the lockspace. Once it has joined the lockspace, it can use resources associated with the lockspace. Resources · A lockspace is a context for resources that can be locked and unlocked by an application. · sanlock uses paxos leases to implement leases on resources. The terms paxos lease and resource lease are used interchangeably. · A paxos lease exists on shared storage and requires 1MB of space. It contains a unique resource name and the name of the lockspace. · An application assigns its own meaning to a sanlock resource and the leases on it. A sanlock resource could represent some shared object like a file, or some unique role among the hosts. · Resource leases are associated with a specific lockspace and can only be used by hosts that have joined that lockspace (they are holding a delta lease on a host_id in that lockspace.) · An application must keep track of the disk locations of its lockspaces and resources.
sanlock does not maintain any persistent index or directory of lockspaces or resources that have been created by applications, so applications need to remember where they have placed their own leases (which files or disks and offsets). · sanlock does not renew paxos leases directly (although it could). Instead, the renewal of a host's delta lease represents the renewal of all that host's paxos leases in the associated lockspace. In effect, many paxos lease renewals are factored out into one delta lease renewal. This reduces i/o when many paxos leases are used. · The disk paxos algorithm allows multiple hosts to all attempt to acquire the same paxos lease at once, and will produce a single win‐ ner/owner of the resource lease. (Shared resource leases are also possible in addition to the default exclusive leases.) · The disk paxos algorithm involves a specific sequence of reading and writing the sectors of the paxos lease disk area. Each host has a dedicated 512 byte sector in the paxos lease disk area where it writes its own "ballot", and each host reads the entire disk area to see the ballots of other hosts. The first sector of the disk area is the "leader record" that holds the result of the last paxos ballot. The winner of the paxos ballot writes the result of the ballot to the leader record (the winner of the ballot may have selected another contending host as the owner of the paxos lease.) · After a paxos lease is acquired, no further i/o is done in the paxos lease disk area. · Releasing the paxos lease involves writing a single sector to clear the current owner in the leader record. · If a host holding a paxos lease fails, the disk area of the paxos lease still indicates that the paxos lease is owned by the failed host. If another host attempts to acquire the paxos lease, and finds the lease is held by another host_id, it will check the delta lease of that host_id. 
If the delta lease of the host_id is being renewed, then the paxos lease is owned and cannot be acquired. If the delta lease of the owner's host_id has expired, then the paxos lease is expired and can be taken (by going through the paxos lease algo‐ rithm.) · The "interaction" or "awareness" between hosts of each other is lim‐ ited to the case where they attempt to acquire the same paxos lease, and need to check if the referenced delta lease has expired or not. · When hosts do not attempt to lock the same resources concurrently, there is no host interaction or awareness. The state or actions of one host have no effect on others. · To speed up checking delta lease expiration (in the case of a paxos lease conflict), sanlock keeps track of past renewals of other delta leases in the lockspace. Resource Index The resource index (rindex) is an optional sanlock feature that appli‐ cations can use to keep track of resource lease offsets. Without the rindex, an application must keep track of where its resource leases exist on disk and find available locations when creating new leases. The sanlock rindex uses two align-size areas on disk following the lockspace. The first area holds rindex entries; each entry records a resource lease name and location. The second area holds a private paxos lease, used by sanlock internally to protect rindex updates. The application creates the rindex on disk with the "format" function. Format is a disk-only operation and does not interact with the live lockspace, so it can be called without first calling add_lockspace. The application needs to follow the convention of writing the lockspace at the start of the device (offset 0) and formatting the rindex immedi‐ ately following the lockspace area. When formatting, the application must set flags for sector size and align size to match those for the lockspace. To use the rindex, the application: · Uses the "create" function to create a new resource lease on disk. 
This takes the place of the write_resource function. The create function requires the location of the rindex and the name of the new resource lease. sanlock finds a free lease area, writes the new resource lease at that location, updates the rindex with the name:offset, and returns the offset to the caller. The caller uses this offset when acquiring the resource lease. · Uses the "delete" function to remove a resource lease on disk (also corresponding to the write_resource function.) sanlock clears the resource lease and the rindex entry for it. A subsequent call to create may use this same disk location for a different resource lease. · Uses the "lookup" function to discover the offset of a resource lease given the resource lease name. The caller would typically call this prior to acquiring the resource lease. · Uses the "rebuild" function to recreate the rindex if it is damaged or becomes inconsistent. This function scans the disk for resource leases and creates new rindex entries to match the leases it finds. · The "update" function manipulates rindex entries directly and should not normally be used by the application. In normal usage, the create and delete functions manipulate rindex entries. Update is mainly useful for testing or repairs. Expiration · If a host fails to renew its delta lease, e.g. it loses access to the storage, its delta lease will eventually expire and another host will be able to take over any resource leases held by the host. san‐ lock must ensure that the application on two different hosts is not holding and using the same lease concurrently. · When sanlock has failed to renew a delta lease for a period of time, it will begin taking measures to stop local processes (applications) from using any resource leases associated with the expiring lockspace delta lease. sanlock enters this "recovery mode" well ahead of the time when another host could take over the locally owned leases.
sanlock must have sufficient time to stop all local processes that are using the expiring leases. · sanlock uses three methods to stop local processes that are using expiring leases: 1. Graceful shutdown. sanlock will execute a "graceful shutdown" program that the application previously specified for this case. The shutdown program tells the application to shut down because its leases are expiring. The application must respond by stopping its activities and releasing its leases (or exit). If an application does not specify a graceful shutdown program, sanlock sends SIGTERM to the process instead. The process must release its leases or exit in a prescribed amount of time (see -g), or sanlock proceeds to the next method of stopping. 2. Forced shutdown. sanlock will send SIGKILL to processes using the expiring leases. The processes have a fixed amount of time to exit after receiving SIGKILL. If any do not exit in this time, sanlock will proceed to the next method. 3. Host reset. sanlock will trigger the host's watchdog device to forcibly reset it. sanlock carefully manages the timing of the watchdog device so that it fires shortly before any other host could take over the resource leases held by local processes. Failures If a process holding resource leases fails or exits without releasing its leases, sanlock will release the leases for it automatically (unless persistent resource leases were used.) If the sanlock daemon cannot renew a lockspace delta lease for a spe‐ cific period of time (see Expiration), sanlock will enter "recovery mode" where it attempts to stop and/or kill any processes holding resource leases in the expiring lockspace. If the processes do not exit in time, sanlock will force the host to be reset using the local watchdog device. If the sanlock daemon crashes or hangs, it will not renew the expiry time of the per-lockspace connections it had to the wdmd daemon. 
This will lead to the expiration of the local watchdog device, and the host will be reset. Watchdog sanlock uses the wdmd(8) daemon to access /dev/watchdog. wdmd multi‐ plexes multiple timeouts onto the single watchdog timer. This is required because delta leases for each lockspace are renewed and expire independently. sanlock maintains a wdmd connection for each lockspace delta lease being renewed. Each connection has an expiry time for some seconds in the future. After each successful delta lease renewal, the expiry time is renewed for the associated wdmd connection. If wdmd finds any con‐ nection expired, it will not renew the /dev/watchdog timer. Given enough successive failed renewals, the watchdog device will fire and reset the host. (Given the multiplexing nature of wdmd, shorter over‐ lapping renewal failures from multiple lockspaces could cause spurious watchdog firing.) The direct link between delta lease renewals and watchdog renewals pro‐ vides a predictable watchdog firing time based on delta lease renewal timestamps that are visible from other hosts. sanlock knows the time the watchdog on another host has fired based on the delta lease time. Furthermore, if the watchdog device on another host fails to fire when it should, the continuation of delta lease renewals from the other host will make this evident and prevent leases from being taken from the failed host. If sanlock is able to stop/kill all processes using an expiring lockspace, the associated wdmd connection for that lockspace is removed. The expired wdmd connection will no longer block /dev/watch‐ dog renewals, and the host should avoid being reset. Storage The sector size and the align size should be specified when creating lockspaces and resources (and rindex). The "align size" is the size on disk of a lockspace or a resource, i.e. the amount of disk space it uses. Lockspaces and resources should use matching sector and align sizes, and must use offsets in multiples of the align size.
The max number of hosts that can use a lockspace or resource depends on the combination of sector size and align size, shown below. The host_id of hosts using the lockspace can be no larger than the max_hosts value for the lockspace. Accepted combinations of sector size and align size, and the corre‐ sponding max_hosts (and max host_id) are: sector_size 512, align_size 1M, max_hosts 2000 sector_size 4096, align_size 1M, max_hosts 250 sector_size 4096, align_size 2M, max_hosts 500 sector_size 4096, align_size 4M, max_hosts 1000 sector_size 4096, align_size 8M, max_hosts 2000 When sector_size and align_size are not specified, the behavior matches the behavior before these sizes could be configured: on devices which report sector size 512, 512/1M/2000 is used, on devices which report sector size 4096, 4096/8M/2000 is used, and on files, 512/1M/2000 is always used. (Other combinations are not compatible with sanlock ver‐ sion 3.6 or earlier.) Using sanlock on shared block devices that do host based mirroring or replication is not likely to work correctly. When using sanlock on shared files, all sanlock io should go to one file server. Example This is an example of creating and using lockspaces and resources from the command line. (Most applications would use sanlock through libsan‐ lock rather than through the command line.) 1. Allocate shared storage for sanlock leases. This example assumes 512 byte sectors on the device, in which case the lockspace needs 1MB and each resource needs 1MB. The example shared block device accessible to all hosts is /dev/leases. 2. Start sanlock on all hosts. The -w 0 disables use of the watchdog for testing. # sanlock daemon -w 0 3. Start a dummy application on all hosts. This sanlock command registers with sanlock, then execs the sleep command which inherits the registered fd. The sleep process acts as the dummy application. Because the sleep process is registered with sanlock, leases can be acquired for it. 
# sanlock client command -c /bin/sleep 600 & 4. Create a lockspace for the application (from one host). The lockspace is named "test". # sanlock client init -s test:0:/dev/leases:0 5. Join the lockspace for the application. Use a unique host_id on each host. host1: # sanlock client add_lockspace -s test:1:/dev/leases:0 host2: # sanlock client add_lockspace -s test:2:/dev/leases:0 6. Create two resources for the application (from one host). The resources are named "RA" and "RB". Offsets are used on the same device as the lockspace. Different LVs or files could also be used. # sanlock client init -r test:RA:/dev/leases:1048576 # sanlock client init -r test:RB:/dev/leases:2097152 7. Acquire resource leases for the application on host1. Acquire an exclusive lease (the default) on the first resource, and a shared lease (SH) on the second resource. # export P=`pidof sleep` # sanlock client acquire -r test:RA:/dev/leases:1048576 -p $P # sanlock client acquire -r test:RB:/dev/leases:2097152:SH -p $P 8. Acquire resource leases for the application on host2. Acquiring the exclusive lease on the first resource will fail because it is held by host1. Acquiring the shared lease on the second resource will succeed. # export P=`pidof sleep` # sanlock client acquire -r test:RA:/dev/leases:1048576 -p $P # sanlock client acquire -r test:RB:/dev/leases:2097152:SH -p $P 9. Release resource leases for the application on both hosts. The sleep pid could also be killed, which will result in the san‐ lock daemon releasing its leases when it exits. # sanlock client release -r test:RA:/dev/leases:1048576 -p $P # sanlock client release -r test:RB:/dev/leases:2097152 -p $P 10. Leave the lockspace for the application. host1: # sanlock client rem_lockspace -s test:1:/dev/leases:0 host2: # sanlock client rem_lockspace -s test:2:/dev/leases:0 11. Stop sanlock on all hosts. 
# sanlock shutdown OPTIONS COMMAND can be one of three primary top level choices sanlock daemon start daemon sanlock client send request to daemon (default command if none given) sanlock direct access storage directly (no coordination with daemon) Daemon Command sanlock daemon [options] -D no fork and print all logging to stderr -Q 0|1 quiet error messages for common lock contention -R 0|1 renewal debugging, log debug info for each renewal -L pri write logging at priority level and up to logfile (-1 none) -S pri write logging at priority level and up to syslog (-1 none) -U uid user id -G gid group id -t num max worker threads -g sec seconds for graceful recovery -w 0|1 use watchdog through wdmd -h 0|1 use high priority (RR) scheduling -l num use mlockall (0 none, 1 current, 2 current and future) -b sec seconds a host id bit will remain set in delta lease bitmap -e str local host name used in delta leases Client Command sanlock client action [options] sanlock client status Print processes, lockspaces, and resources being managed by the sanlock daemon. Add -D to show extra internal daemon status for debugging. Add -o p to show resources by pid, or -o s to show resources by lockspace. sanlock client host_status Print state of host_id delta leases read during the last renewal. State of all lockspaces is shown (use -s to select one). Add -D to show extra internal daemon status for debugging. sanlock client gets Print lockspaces being managed by the sanlock daemon. The LOCKSPACE string will be followed by ADD or REM if the lockspace is currently being added or removed. Add -h 1 to also show hosts in each lockspace. sanlock client renewal -s LOCKSPACE Print a history of renewals with timing details. See the Renewal his‐ tory section below. sanlock client log_dump Print the sanlock daemon internal debug log. sanlock client shutdown Ask the sanlock daemon to exit. Without the force option (-f 0), the command will be ignored if any lockspaces exist. 
With the force option (-f 1), any registered processes will be killed, their resource leases released, and lockspaces removed. With the wait option (-w 1), the command will wait for a result from the daemon indicating that it has shut down and is exiting, or cannot shut down because lockspaces exist (command fails). sanlock client init -s LOCKSPACE Tell the sanlock daemon to initialize a lockspace on disk. The -o option can be used to specify the io timeout to be written in the host_id leases. The -Z and -A options can be used to specify the sec‐ tor size and align size, and both should be set together. (Also see sanlock direct init.) sanlock client init -r RESOURCE Tell the sanlock daemon to initialize a resource lease on disk. The -Z and -A options can be used to specify the sector size and align size, and both should be set together. (Also see sanlock direct init.) sanlock client read -s LOCKSPACE Tell the sanlock daemon to read a lockspace from disk. Only the LOCKSPACE path and offset are required. If host_id is zero, the first record at offset (host_id 1) is used. The complete LOCKSPACE is printed. Add -D to print other details. (Also see sanlock direct read_leader.) sanlock client read -r RESOURCE Tell the sanlock daemon to read a resource lease from disk. Only the RESOURCE path and offset are required. The complete RESOURCE is printed. Add -D to print other details. (Also see sanlock direct read_leader.) sanlock client add_lockspace -s LOCKSPACE Tell the sanlock daemon to acquire the specified host_id in the lockspace. This will allow resources to be acquired in the lockspace. The -o option can be used to specify the io timeout of the acquiring host, and will be written in the host_id lease. sanlock client inq_lockspace -s LOCKSPACE Inquire about the state of the lockspace in the sanlock daemon, whether it is being added or removed, or is joined. 
sanlock client rem_lockspace -s LOCKSPACE Tell the sanlock daemon to release the specified host_id in the lockspace. Any processes holding resource leases in this lockspace will be killed, and the resource leases not released. sanlock client command -r RESOURCE -c path args Register with the sanlock daemon, acquire the specified resource lease, and exec the command at path with args. When the command exits, the sanlock daemon will release the lease. -c must be the final option. sanlock client acquire -r RESOURCE -p pid sanlock client release -r RESOURCE -p pid Tell the sanlock daemon to acquire or release the specified resource lease for the given pid. The pid must be registered with the sanlock daemon. acquire can optionally take a versioned RESOURCE string RESOURCE:lver, where lver is the version of the lease that must be acquired, or fail. sanlock client convert -r RESOURCE -p pid Tell the sanlock daemon to convert the mode of the specified resource lease for the given pid. If the existing mode is exclusive (default), the mode of the lease can be converted to shared with RESOURCE:SH. If the existing mode is shared, the mode of the lease can be converted to exclusive with RESOURCE (no :SH suffix). sanlock client inquire -p pid Print the resource leases held by the given pid. The format is a ver‐ sioned RESOURCE string "RESOURCE:lver" where lver is the version of the lease held. sanlock client request -r RESOURCE -f force_mode Request the owner of a resource do something specified by force_mode. A versioned RESOURCE:lver string must be used with a greater version than is presently held. Zero lver and force_mode clears the request. sanlock client examine -r RESOURCE Examine the request record for the currently held resource lease and carry out the action specified by the requested force_mode. sanlock client examine -s LOCKSPACE Examine requests for all resource leases currently held in the named lockspace. Only lockspace_name is used from the LOCKSPACE argument.
sanlock client set_event -s LOCKSPACE -i host_id -g gen -e num -d num Set an event for another host. When the sanlock daemon next renews its delta lease for the lockspace it will: set the bit for the host_id in its bitmap, and set the generation, event and data values in its own delta lease. An application that has registered for events from this lockspace on the destination host will get the event that has been set when the destination sees the event during its next delta lease renewal. sanlock client set_config -s LOCKSPACE Set a configuration value for a lockspace. Only lockspace_name is used from the LOCKSPACE argument. The USED flag has the same effect on a lockspace as a process holding a resource lease that will not exit. The USED_BY_ORPHANS flag means that an orphan resource lease will have the same effect as the USED. -u 0|1 Set (1) or clear (0) the USED flag. -O 0|1 Set (1) or clear (0) the USED_BY_ORPHANS flag. sanlock client format -x RINDEX Create a resource index on disk. Use -Z and -A to set the sector size and align size to match the lockspace. sanlock client create -x RINDEX -e resource_name Create a new resource lease on disk, using the rindex to find a free offset. sanlock client delete -x RINDEX -e resource_name[:offset] Delete an existing resource lease on disk. sanlock client lookup -x RINDEX -e resource_name Look up the offset of an existing resource lease by name on disk, using the rindex. With no -e option, lookup returns the next free lease off‐ set. If -e specifies both name and offset, the lookup verifies both are correct. sanlock client update -x RINDEX -e resource_name[:offset] [-z 0|1] Add (-z 0) or remove (-z 1) an rindex entry on disk. sanlock client rebuild -x RINDEX Rebuild the rindex entries by scanning the disk for resource leases. Direct Command sanlock direct action [options] -o sec io timeout in seconds sanlock direct init -s LOCKSPACE sanlock direct init -r RESOURCE Initialize storage for a lockspace or resource.
Use the -Z and -A flags to specify the sector size and align size. The max hosts that can use the lockspace/resource (and the max possible host_id) is deter‐ mined by the sector/align size combination. Possible combinations are: 512/1M, 4096/1M, 4096/2M, 4096/4M, 4096/8M. Lockspaces and resources both use the same amount of space (align_size) for each combination. When initializing a lockspace, sanlock initializes delta leases for max_hosts in the given space. When initializing a resource, sanlock initializes a single paxos lease in the space. With -s, the -o option specifies the io timeout to be written in the host_id leases. With -r, the -z 1 option invalidates the resource lease on disk so it cannot be used until reinitialized normally. sanlock direct read_leader -s LOCKSPACE sanlock direct read_leader -r RESOURCE Read a leader record from disk and print the fields. The leader record is the single sector of a delta lease, or the first sector of a paxos lease. sanlock direct dump path[:offset[:size]] Read disk sectors and print leader records for delta or paxos leases. Add -f 1 to print the request record values for paxos leases, host_ids set in delta lease bitmaps, and rindex entries. sanlock direct format -x RINDEX sanlock direct lookup -x RINDEX -e resource_name sanlock direct update -x RINDEX -e resource_name[:offset] [-z 0|1] sanlock direct rebuild -x RINDEX Access the resource index on disk without going through the sanlock daemon. This precludes using the internal paxos lease to protect rindex modifications. See client equivalents for descriptions. 
LOCKSPACE option string -s lockspace_name:host_id:path:offset lockspace_name name of lockspace host_id local host identifier in lockspace path path to storage to use for leases offset offset on path (bytes) RESOURCE option string -r lockspace_name:resource_name:path:offset lockspace_name name of lockspace resource_name name of resource path path to storage to use for leases offset offset on path (bytes) RESOURCE option string with suffix -r lockspace_name:resource_name:path:offset:lver lver leader version -r lockspace_name:resource_name:path:offset:SH SH indicates shared mode RINDEX option string -x lockspace_name:path:offset lockspace_name name of lockspace path path to storage to use for leases offset offset on path (bytes) of rindex Defaults sanlock help shows the default values for the options above. sanlock version shows the build version. OTHER Request/Examine The first part of making a request for a resource is writing the request record of the resource (the sector following the leader record). To make a successful request: · RESOURCE:lver must be greater than the lver presently held by the other host. This implies the leader record must be read to discover the lver, prior to making a request. · RESOURCE:lver must be greater than or equal to the lver presently written to the request record. Two hosts may write a new request at the same time for the same lver, in which case both would succeed, but the force_mode from the last would win. · The force_mode must be greater than zero. · To unconditionally clear the request record (set both lver and force_mode to 0), make request with RESOURCE:0 and force_mode 0. The owner of the requested resource will not know of the request unless it is explicitly told to examine its resources via the "examine" api/command, or otherwise notified. The second part of making a request is notifying the resource lease owner that it should examine the request records of its resource leases.
The notification will cause the lease owner to automatically run the equivalent of "sanlock client examine -s LOCKSPACE" for the lockspace of the requested resource. The notification is made using a bitmap in each host_id delta lease. Each bit represents each of the possible host_ids (1-2000). If host A wants to notify host B to examine its resources, A sets the bit in its own bitmap that corresponds to the host_id of B. When B next renews its delta lease, it reads the delta leases for all hosts and checks each bitmap to see if its own host_id has been set. It finds the bit for its own host_id set in A's bitmap, and examines its resource request records. (The bit remains set in A's bitmap for set_bit‐ map_seconds.) force_mode determines the action the resource lease owner should take: · FORCE (1): kill the process holding the resource lease. When the process has exited, the resource lease will be released, and can then be acquired by anyone. The kill signal is SIGKILL (or SIGTERM if SIGKILL is restricted.) · GRACEFUL (2): run the program configured by sanlock_killpath against the process holding the resource lease. If no killpath is defined, then FORCE is used. Persistent and orphan resource leases A resource lease can be acquired with the PERSISTENT flag (-P 1). If the process holding the lease exits, the lease will not be released, but kept on an orphan list. Another local process can acquire an orphan lease using the ORPHAN flag (-O 1), or release the orphan lease using the ORPHAN flag (-O 1). All orphan leases can be released by setting the lockspace name (-s lockspace_name) with no resource name. Renewal history sanlock saves a limited history of lease renewal information in each lockspace. See sanlock.conf renewal_history_size to set the amount of history or to disable (set to 0). IO times are measured in delta lease renewal (each delta lease renewal includes one read and one write). 
For each successful renewal, a record is saved that includes: · the timestamp written in the delta lease by the renewal · the time in milliseconds taken by the delta lease read · the time in milliseconds taken by the delta lease write Also counted and recorded are the number of io timeouts and other io errors that occur between successful renewals. Two consecutive successful renewals would be recorded as: timestamp=5332 read_ms=482 write_ms=5525 next_timeouts=0 next_errors=0 timestamp=5353 read_ms=99 write_ms=3161 next_timeouts=0 next_errors=0 Those fields are: · timestamp is the value written into the delta lease during that renewal. · read_ms/write_ms are the milliseconds taken for the renewal read/write ios. · next_timeouts are the number of io timeouts that occurred after the renewal recorded on that line, and before the next successful renewal on the following line. · next_errors are the number of io errors (not timeouts) that occurred after the renewal recorded on that line, and before the next successful renewal on the following line. The command 'sanlock client renewal -s lockspace_name' reports the full history of renewals saved by sanlock, which by default is 180 records, about 1 hour of history when using a 20 second renewal interval for a 10 second io timeout. INTERNALS Disk Format · This example uses 512 byte sectors. · Each lockspace is 1MB. It holds 2000 delta_leases, one per sector, supporting up to 2000 hosts. · Each paxos_lease is 1MB. It is used as a lease for one resource. · The leader_record structure is used differently by each lease type. · To display all leader_record fields, see sanlock direct read_leader. · A lockspace is often followed on disk by the paxos_leases used within that lockspace, but this layout is not required. · The request_record and host_id bitmap are used for requests/events. · The mode_block contains the SHARED flag indicating a lease is held in the shared mode.
· In a lockspace, the host using host_id N writes to a single delta_lease in sector N-1. No other hosts write to this sector. All hosts read all lockspace sectors when renewing their own delta_lease, and are able to monitor renewals of all delta_leases. · In a paxos_lease, each host has a dedicated sector it writes to, con‐ taining its own paxos_dblock and mode_block structures. Its sector is based on its host_id; host_id 1 writes to the dblock/mode_block in sector 2 of the paxos_lease. · The paxos_dblock structures are used by the paxos_lease algorithm, and the result is written to the leader_record. 0x000000 lockspace foo:0:/path:0 (There is no representation on disk of the lockspace in general, only the sequence of specific delta_leases which collectively represent the lockspace.) delta_lease foo:1:/path:0 0x000 0 leader_record (sector 0, for host_id 1) magic: 0x12212010 space_name: foo resource_name: host uuid/name ... host_id bitmap (leader_record + 256) delta_lease foo:2:/path:0 0x200 512 leader_record (sector 1, for host_id 2) magic: 0x12212010 space_name: foo resource_name: host uuid/name ... host_id bitmap (leader_record + 256) delta_lease foo:3:/path:0 0x400 1024 leader_record (sector 2, for host_id 3) magic: 0x12212010 space_name: foo resource_name: host uuid/name ... host_id bitmap (leader_record + 256) delta_lease foo:2000:/path:0 0xF9E00 leader_record (sector 1999, for host_id 2000) magic: 0x12212010 space_name: foo resource_name: host uuid/name ... 
host_id bitmap (leader_record + 256) 0x100000 paxos_lease foo:example1:/path:1048576 0x000 0 leader_record (sector 0) magic: 0x06152010 space_name: foo resource_name: example1 0x200 512 request_record (sector 1) magic: 0x08292011 0x400 1024 paxos_dblock (sector 2, for host_id 1) 0x480 1152 mode_block (paxos_dblock + 128) 0x600 1536 paxos_dblock (sector 3, for host_id 2) 0x680 1664 mode_block (paxos_dblock + 128) 0x800 2048 paxos_dblock (sector 4, for host_id 3) 0x880 2176 mode_block (paxos_dblock + 128) 0xFA200 paxos_dblock (sector 2001, for host_id 2000) 0xFA280 mode_block (paxos_dblock + 128) 0x200000 paxos_lease foo:example2:/path:2097152 0x000 0 leader_record (sector 0) magic: 0x06152010 space_name: foo resource_name: example2 0x200 512 request_record (sector 1) magic: 0x08292011 0x400 1024 paxos_dblock (sector 2, for host_id 1) 0x480 1152 mode_block (paxos_dblock + 128) 0x600 1536 paxos_dblock (sector 3, for host_id 2) 0x680 1664 mode_block (paxos_dblock + 128) 0x800 2048 paxos_dblock (sector 4, for host_id 3) 0x880 2176 mode_block (paxos_dblock + 128) 0xFA200 paxos_dblock (sector 2001, for host_id 2000) 0xFA280 mode_block (paxos_dblock + 128) Lease ownership Not shown in the leader_record structures above are the owner_id, owner_generation and timestamp fields. These are the fields that define the lease owner. The delta_lease at sector N for host_id N+1 has leader_record.owner_id N+1. The leader_record.owner_generation is incremented each time the delta_lease is acquired. When a delta_lease is acquired, the leader_record.timestamp field is set to the time of the host and the leader_record.resource_name is set to the unique name of the host. When the host renews the delta_lease, it writes a new leader_record.timestamp. When a host releases a delta_lease, it writes zero to leader_record.timestamp. When a host acquires a paxos_lease, it uses the host_id/generation value from the delta_lease it holds in the lockspace. 
It uses this host_id/generation to identify itself in the paxos_dblock when running the paxos algorithm. The result of the algorithm is the winning host_id/generation - the new owner of the paxos_lease. The winning host_id/generation are written to the paxos_lease leader_record.owner_id and leader_record.owner_generation fields and leader_record.timestamp is set. When a host releases a paxos_lease, it sets leader_record.timestamp to 0. When a paxos_lease is free (leader_record.timestamp is 0), multiple hosts may attempt to acquire it. The paxos algorithm, using the paxos_dblock structures, will select only one of the hosts as the new owner, and that owner is written in the leader_record. The paxos_lease will no longer be free (non-zero timestamp). Other hosts will see this and will not attempt to acquire the paxos_lease until it is free again. If a paxos_lease is owned (non-zero timestamp), but the owner has not renewed its delta_lease for a specific length of time, then the owner value in the paxos_lease becomes expired, and other hosts will use the paxos algorithm to acquire the paxos_lease, and set a new owner. FILES /etc/sanlock/sanlock.conf · quiet_fail = 1 See -Q · debug_renew = 0 See -R · logfile_priority = 4 See -L · logfile_use_utc = 0 Use UTC instead of local time in log messages. · syslog_priority = 3 See -S · names_log_priority = 4 Log resource names at this priority level (uses syslog priority num‐ bers). If this is greater than or equal to logfile_priority, each requested resource name and location is recorded in sanlock.log. · use_watchdog = 1 See -w · high_priority = 1 See -h · mlock_level = 1 See -l · sh_retries = 8 The number of times to try acquiring a paxos lease when acquiring a shared lease when the paxos lease is held by another host acquiring a shared lease. 
· uname = sanlock See -U · gname = sanlock See -G · our_host_name = See -e · renewal_read_extend_sec = If a renewal read i/o times out, wait this many additional seconds for that read to complete at the start of the subsequent renewal attempt. When not configured, sanlock waits for an additional io_timeout seconds for a previous timed out read to complete. · renewal_history_size = 180 See -H · paxos_debug_all = 0 Include all details in the paxos debug logging. · debug_io = Add debug logging for each i/o. "submit" (no quotes) produces debug output at submission time, "complete" produces debug output at completion time, and "submit,complete" (no space) produces both. · max_sectors_kb = | Set to "ignore" (no quotes) to prevent sanlock from checking or changing max_sectors_kb for the lockspace disk when starting a lockspace. Set to "align" (no quotes) to set max_sectors_kb for the lockspace disk to the align size of the lockspace. Set to a number to set a specific number of KB for all lockspace disks. SEE ALSO wdmd(8) 2015-01-23 SANLOCK(8) WDMD(8) System Manager's Manual WDMD(8) NAME wdmd - watchdog multiplexing daemon SYNOPSIS wdmd [OPTIONS] DESCRIPTION This daemon opens /dev/watchdog and allows multiple independent sources to determine whether each KEEPALIVE is done. Every test interval (10 seconds), the daemon tests each source. If any test fails, the KEEPALIVE is not done. In a standard configuration, the watchdog timer will reset the system if no KEEPALIVE is done for 60 seconds ("fire timeout"). This means that if a single test fails 5-6 times in a row, the watchdog will fire and reset the system. With multiple test sources, fewer separate failures back to back can also cause a reset, e.g.
T seconds, P pass, F fail T00: test1 P, test2 P, test3 P: KEEPALIVE done T10: test1 F, test2 F, test3 P: KEEPALIVE skipped T20: test1 F, test2 P, test3 P: KEEPALIVE skipped T30: test1 P, test2 F, test3 P: KEEPALIVE skipped T40: test1 P, test2 P, test3 F: KEEPALIVE skipped T50: test1 F, test2 F, test3 P: KEEPALIVE skipped T60: test1 P, test2 F, test3 P: KEEPALIVE skipped T60: watchdog fires, system resets (Depending on timings, the system may be reset sometime shortly before T60, and the tests at T60 would not be run.) A crucial aspect to the design and function of wdmd is that if any sin‐ gle source does not pass tests for the fire timeout, the watchdog is guaranteed to fire, regardless of whether other sources on the system have passed or failed. A spurious reset due to the combined effects of multiple failing tests as shown above, is an accepted side effect. The wdmd init script will load the softdog module if no other watchdog module has been loaded. wdmd cannot be used on the system with any other program that needs to open /dev/watchdog, e.g. watchdog(8). Test Source: clients Using libwdmd, programs connect to wdmd via a unix socket, and send regular messages to wdmd to update an expiry time for their connection. Every test interval, wdmd will check if the expiry time for a connec‐ tion has been reached. If so, the test for that client fails. Test Source: scripts wdmd will run scripts from a designated directory every test interval. If a script exits with 0, the test is considered a success, otherwise a failure. If a script does not exit by the end of the test interval, it is considered a failure. OPTIONS --version, -V Print version. --help, -h Print usage. --dump, -d Print debug information from the daemon. --probe, -p Print path of functional watchdog device. Exit code 0 indi‐ cates a functional device was found. Exit code 1 indicates a func‐ tional device was not found. -D Enable debugging to stderr and don't fork. 
-H 0|1 Enable (1) or disable (0) high priority features such as real‐ time scheduling priority and mlockall. -G name Group ownership for the socket. -S 0|1 Enable (1) or disable (0) script tests. -s path Path to scripts dir. -k num Kill unfinished scripts after num seconds. -w path The path to the watchdog device to try first. 2011-08-01 WDMD(8) :: sanlock-3.8.2/VERSION000066400000000000000000000000061371427612200142270ustar00rootroot000000000000003.8.2 sanlock-3.8.2/common.mk000066400000000000000000000012531371427612200150050ustar00rootroot00000000000000export CC ?= cc check = $(shell echo "int main() { return 0; }" \ | $(CC) $(1) -xc - >&/dev/null && echo $(1) ||:) export CFLAGS += -D_GNU_SOURCE -O2 -ggdb \ -Wall \ -Wformat \ -Wformat-security \ -Wmissing-prototypes \ -Wnested-externs \ -Wpointer-arith \ -Wextra \ -Wshadow \ -Wcast-align \ -Wwrite-strings \ -Waggregate-return \ -Wstrict-prototypes \ -Winline \ -Wredundant-decls \ -Wno-sign-compare \ -Wno-unused-parameter \ -Wp,-D_FORTIFY_SOURCE=2 \ -Wno-strict-overflow \ -fexceptions \ -fasynchronous-unwind-tables \ -fdiagnostics-show-option \ -Wp,-D_GLIBCXX_ASSERTIONS \ -fstack-protector-strong \ $(check -fstack-clash-protection) \ -Wl,-z,now sanlock-3.8.2/fence_sanlock/000077500000000000000000000000001371427612200157555ustar00rootroot00000000000000sanlock-3.8.2/fence_sanlock/Makefile000066400000000000000000000027051371427612200174210ustar00rootroot00000000000000TARGET1 = fence_sanlockd TARGET2 = fence_sanlock SOURCE1 = fence_sanlockd.c SOURCE2 = fence_sanlock.in OPTIMIZE_FLAG = -O2 -Wp,-D_FORTIFY_SOURCE=2 ifeq ($(DEBUG), 1) OPTIMIZE_FLAG = -O0 CFLAGS += -g endif CFLAGS += -D_GNU_SOURCE -g \ -Wall \ -Wformat \ -Wformat-security \ -Wnested-externs \ -Wpointer-arith \ -Wextra -Wshadow \ -Wcast-align \ -Wwrite-strings \ -Waggregate-return \ -Wstrict-prototypes \ -Winline \ -Wredundant-decls \ -Wno-sign-compare \ -fexceptions \ -fasynchronous-unwind-tables \ -fdiagnostics-show-option \ $(OPTIMIZE_FLAG) \ $(NULL) 
VER=$(shell cat ../VERSION) CFLAGS += -DVERSION=\"$(VER)\" -I../src -I../wdmd CFLAGS += -fPIE -DPIE LDFLAGS = -Wl,-z,now -Wl,-z,relro -pie LDADD = -lrt -laio -lblkid -lsanlock -lwdmd all: $(TARGET1) $(TARGET2) $(TARGET1): $(SOURCE1) $(CC) $(CFLAGS) $(LDFLAGS) $(SOURCE1) $(LDADD) -o $@ -L. -L../src -L../wdmd $(TARGET2): $(SOURCE2) cat $(SOURCE2) | sed \ -e 's#@VERSION@#${VER}#g' \ > $(TARGET2) chmod 755 $(TARGET2) clean: rm -f *.o *.so *.so.* $(TARGET1) $(TARGET2) INSTALL=$(shell which install) DESTDIR= BINDIR=/usr/sbin LIBDIR=/usr/lib64 HEADIR=/usr/include MANDIR=/usr/share/man .PHONY: install install: all $(INSTALL) -d $(DESTDIR)/$(BINDIR) $(INSTALL) -d $(DESTDIR)/$(MANDIR)/man8 $(INSTALL) -c -m 755 $(TARGET1) $(TARGET2) $(DESTDIR)/$(BINDIR) $(INSTALL) -m 644 fence_sanlock.8 $(DESTDIR)/$(MANDIR)/man8/ $(INSTALL) -m 644 fence_sanlockd.8 $(DESTDIR)/$(MANDIR)/man8/ sanlock-3.8.2/fence_sanlock/fence_sanlock.8000066400000000000000000000200671371427612200206450ustar00rootroot00000000000000.TH FENCE_SANLOCK 8 2013-05-02 .SH NAME fence_sanlock \- fence agent using watchdog and shared storage leases .SH SYNOPSIS .B fence_sanlock [OPTIONS] .SH DESCRIPTION fence_sanlock uses the watchdog device to reset nodes, in conjunction with three daemons: fence_sanlockd, sanlock, and wdmd. The watchdog device, controlled through /dev/watchdog, is available when a watchdog kernel module is loaded. A module should be loaded for the available hardware. If no hardware watchdog is available, or no module is loaded, the "softdog" module will be loaded, which emulates a hardware watchdog device. Shared storage must be configured for sanlock to use from all hosts. This is generally an lvm lv (non-clustered), but could be another block device, or NFS file. The storage should be 1GB of fully allocated space. 
After being created, the storage must be initialized with the command: .br # fence_sanlock -o sanlock_init -p /path/to/storage The fence_sanlock agent uses sanlock leases on shared storage to verify that hosts have been reset, and to notify fenced nodes that are still running, that they should be reset. The fence_sanlockd init script starts the wdmd, sanlock and fence_sanlockd daemons before the cluster or fencing systems are started (e.g. cman, corosync and fenced). The fence_sanlockd daemon is started with the -w option so it waits for the path and host_id options to be provided when they are available. Unfencing must be configured for fence_sanlock in cluster.conf. The cman init script does unfencing by running fence_node -U, which in turn runs fence_sanlock with the "on" action and local path and host_id values taken from cluster.conf. fence_sanlock in turn passes the path and host_id values to the waiting fence_sanlockd daemon. With these values, fence_sanlockd joins the sanlock lockspace and acquires a resource lease for the local host. It can take several minutes to complete these unfencing steps. Once unfencing is complete, the node is a member of the sanlock lockspace named "fence" and the node's fence_sanlockd process holds a resource lease named "hN", where N is the node's host_id. (To verify this, run the commands "sanlock client status" and "sanlock client host_status", which show state from the sanlock daemon, or "sanlock direct dump " which shows state from shared storage.) When fence_sanlock fences a node, it tries to acquire that node's resource lease. sanlock will not grant the lease until the owner (the node being fenced) has been reset by its watchdog device. The time it takes to acquire the lease is 140 seconds from the victim's last lockspace renewal timestamp on the shared storage. Once acquired, the victim's lease is released, and fencing completes successfully. 
Live nodes being fenced When a live node is being fenced, fence_sanlock will continually fail to acquire the victim's lease, because the victim continues to renew its lockspace membership on storage, and the fencing node sees it is alive. This is by design. As long as the victim is alive, it must continue to renew its lockspace membership on storage. The victim must not allow the remote fence_sanlock to acquire its lease and consider it fenced while it is still alive. At the same time, a victim knows that when it is being fenced, it should be reset to avoid blocking recovery of the rest of the cluster. To communicate this, fence_sanlock makes a "request" on storage for the victim's resource lease. On the victim, fence_sanlockd, which holds the resource lease, is configured to receive SIGUSR1 from sanlock if anyone requests its lease. Upon receiving the signal, fence_sanlockd knows that it is a fencing victim. In response to this, fence_sanlockd allows its wdmd connection to expire, which in turn causes the watchdog device to fire, resetting the node. The watchdog reset will obviously have the effect of stopping the victim's lockspace membership renewals. Once the renewals stop, fence_sanlock will finally be able to acquire the victim's lease after waiting a fixed time from the final lockspace renewal. Loss of shared storage If access to shared storage with sanlock leases is lost for 80 seconds, sanlock is not able to renew the lockspace membership, and enters recovery. This causes sanlock clients holding leases, such as fence_sanlockd, to be notified that their leases are being lost. In response, fence_sanlockd must reset the node, much as if it was being fenced. Daemons killed/crashed/hung If sanlock, fence_sanlockd daemons are killed abnormally, or crash or hang, their wdmd connections will expire, causing the watchdog device to fire, resetting the node. fence_sanlock from another node will then run and acquire the victim's resource lease. 
If the wdmd daemon is killed abnormally or crashes or hangs, it will not pet the watchdog device, causing it to fire and reset the node. Time Values The specific time periods referenced above, e.g. 140, 80, are based on the default sanlock i/o timeout of 10 seconds. If sanlock is configured to use a different i/o timeout, these numbers will be different. .SH OPTIONS .BI \-o " action" The agent action: .IP .B on .br Enable the local node to be fenced. Used by unfencing. .IP .B off .br Disable another node. .IP .B status .br Test if a node is on or off. A node is on if its lease is held, and off if its lease is free. .IP .B metadata .br Print xml description of required parameters. .IP .B sanlock_init .br Initialize sanlock leases on shared storage. .PP .BI \-p " path" The path to shared storage with sanlock leases. .PP .BI \-i " host_id" The host_id, from 1-128. .SH STDIN PARAMETERS Options can be passed on stdin, with the format key=val. Each key=val pair is separated by a new line. action=on|off|status .br See \-o path=/path/to/shared/storage .br See \-p host_id=num .br See \-i .SH FILES Example cluster.conf configuration for fence_sanlock. .br (For cman based clusters in which fenced runs agents.) .br Also see cluster.conf(5), fenced(8), fence_node(8). .nf .fi .PP Example dlm.conf configuration for fence_sanlock. .br (For non-cman based clusters in which dlm_controld runs agents.) .br Also see dlm.conf(5), dlm_controld(8). .nf device wd /usr/sbin/fence_sanlock path=/dev/fence/leases connect wd node=1 host_id=1 connect wd node=2 host_id=2 unfence wd .fi .SH TEST To test fence_sanlock directly, without clustering: .nf 1. Initialize storage node1: create 1G lv on shared storage /dev/fence/leases node1: fence_sanlock -o sanlock_init -p /dev/fence/leases 2. Start services node1: service fence_sanlockd start node2: service fence_sanlockd start 3.
Enable fencing node1: fence_sanlock -o on -p /dev/fence/leases -i 1 node2: fence_sanlock -o on -p /dev/fence/leases -i 2 This "unfence" step may take a couple minutes. 4. Verify hosts and leases node1: sanlock status s fence:1:/dev/fence/leases:0 r fence:h1:/dev/fence/leases:1048576:1 p 2465 node2: sanlock status s fence:2:/dev/fence/leases:0 r fence:h2:/dev/fence/leases:2097152:1 p 2366 node1: sanlock host_status lockspace fence 1 timestamp 717 2 timestamp 678 node2: sanlock host_status lockspace fence 1 timestamp 738 2 timestamp 678 5. Fence node2 node1: fence_sanlock -o off -p /dev/fence/leases -i 2 This may take a few minutes to return. When node2 is not dead before fencing, sanlock on node1 will log errors about failing to acquire the lease while node2 is still alive. This is expected. 6. Success node1 fence_sanlock should exit 0 after node2 is reset by its watchdog. .fi .SH SEE ALSO .BR fence_sanlockd (8), .BR sanlock (8), .BR wdmd (8) sanlock-3.8.2/fence_sanlock/fence_sanlock.in000077500000000000000000000235731371427612200211140ustar00rootroot00000000000000#!/bin/bash # Copyright 2012 Red Hat, Inc. # # This copyrighted material is made available to anyone wishing to use, # modify, copy, or redistribute it subject to the terms and conditions # of the GNU General Public License v2 or (at your option) any later version. 
# cluster.conf # # # # # # # # # # # # # # # # # # # # # # # #
# NOTE(review): the example above appears to have lost its XML markup in
# this copy (only the leading '#' of each line remains) -- restore it from
# the upstream file if the example is needed.

prog=fence_sanlock
max_hosts=128

# Option state; filled in from the command line or from stdin.
opts=
action=
path=
host_id=
offset=

# Print the usage text to stdout.
help() {
	echo "Usage:"
	echo ""
	echo "$prog [options]"
	echo ""
	echo "Options:"
	echo " -o Action: off (default), on, status or metadata"
	echo " (sanlock specific actions: sanlock_init)"
	echo " -p sanlock shared storage for leases"
	echo " -i sanlock host_id of node to operate on"
	echo " -h Print this help, then exit"
	echo " -V Print program version information, then exit"
	echo ""
	echo "stdin options:"
	echo " action="
	echo " path="
	echo " host_id="
}

# Consume the getopt-normalized argument list (terminated by "--") and set
# action, path and host_id.  -h and -V print and exit immediately.
cli_options() {
	while [ "$1" != "--" ]; do
		case $1 in
		-o)
			action=$2
			shift
			;;
		-p)
			path=$2
			shift
			;;
		-i)
			host_id=$2
			shift
			;;
		-h)
			help
			exit 0
			;;
		-V)
			echo "$prog version @VERSION@"
			exit 0
			;;
		esac
		shift
	done
}

# Read key=val pairs from stdin, one per line, and set action, path and
# host_id.  Unknown keys are ignored.
stdin_options() {
	oldIFS="$IFS"
	# split each input line on '=' so that "read" yields key and value
	export IFS="="
	while read key val; do
		case "$key" in
		action)
			action=$val
			;;
		path)
			path=$val
			;;
		host_id)
			host_id=$val
			;;
		esac
	done
	export IFS="$oldIFS"
}

# With no arguments the options come from stdin, otherwise from the
# command line.
if [ $# -eq 0 ]; then
	stdin_options
else
	opts=$(getopt n:o:p:i:hV $@)
	if [ "$?" != 0 ]; then
		help
		exit 1
	fi
	cli_options $opts
fi

# Print the agent description for the "metadata" action.
# NOTE(review): the here-document below appears to have lost its XML markup
# in this copy (the man page says this action prints an xml description);
# restore the tags from the upstream file before relying on this output.
metadata() {
	cat << EOF
fence_sanlock is an i/o fencing agent that uses the watchdog device to
reset nodes. Shared storage (block or file) is used by sanlock to ensure
that fenced nodes are reset, and to notify partitioned nodes that they
need to be reset.
http://www.redhat.com/
Fencing Action
Path to sanlock shared storage
Host id for sanlock (1-128)
EOF

	return 0
}

# Read the leader record of resource lease h$host_id into the global
# "leader" and check its magic number.  Returns 0 if the storage looks
# initialized, 1 (after logging) otherwise.
read_leader() {
	# verify storage has been initialized
	leader=$(sanlock direct read_leader -r fence:h$host_id:$path:$offset 2>&1)
	[ "$?" != 0 ] && {
		logger -t $prog "$action: storage error: unable to read $path"
		return 1
	}

	magic="$(echo "$leader" | grep magic | awk '{print $NF}')"

	[ -z "$magic" ] && {
		logger -t $prog "$action: storage error: no sanlock magic number at $path:$offset"
		return 1
	}

	[ "$magic" != "0x6152010" ] && {
		logger -t $prog "$action: storage error: invalid sanlock magic number $magic at $path:$offset"
		return 1
	}

	return 0
}

# "on" action (unfencing): pass path and host_id to the waiting
# fence_sanlockd, then poll until the local resource lease shows up in
# "sanlock client status".  Exits 0 on success, returns 1 on error.
action_on() {
	read_leader || return 1

	[ -z "$(pidof fence_sanlockd)" ] && {
		logger -t $prog "on fence_sanlockd is not running"
		return 1
	}

	# send p,i options to fence_sanlockd which is waiting for them
	errmsg="$(fence_sanlockd -s -p $path -i $host_id 2>&1)"
	[ "$?" != 0 ] && {
		logger -t $prog "on fence_sanlockd -s error: $errmsg"
		return 1
	}

	# wait for fence_sanlockd to acquire the local lease;
	# it can take minutes, and we can't allow fence_tool join
	# until this is complete
	initdone=""
	while [ -z "$initdone" ]; do
		# make sure sanlockd is alive
		clientstatus="$(sanlock client status 2>&1)"
		# save the status now: the "[" test below overwrites $?
		rv=$?
		[ "$rv" != 0 ] && {
			logger -t $prog "on sanlock client status error $rv"
			return 1
		}

		# make sure fence_sanlockd is alive
		[ -z "$(pidof fence_sanlockd)" ] && {
			logger -t $prog "on fence_sanlockd stopped running"
			return 1
		}

		# FIXME: check that r is really done being acquired?
		# just appearing in output may not be enough
		echo "$clientstatus" | grep -q fence:h$host_id:$path:$offset && initdone="yes"
		sleep 1
	done

	exit 0
}

# "off" action: fence host_id by acquiring its resource lease.  On the
# first failed attempt a request is written for the victim's lease so the
# victim resets itself.  Returns 0 once the lease has been acquired (or
# was already free, or was cleanly reacquired by the victim with a new
# generation); returns 1 on error.
action_off() {
	read_leader || return 1

	owner_id="$(echo "$leader" | grep owner_id | awk '{print $NF}')"
	owner_gen="$(echo "$leader" | grep owner_gen | awk '{print $NF}')"
	ver="$(echo "$leader" | grep lver | awk '{print $NF}')"
	timestamp="$(echo "$leader" | grep ^timestamp | awk '{print $NF}')"

	# lease is released, so host is off
	[ "$timestamp" = 0 ] && {
		return 0
	}

	# owner_id should equal host_id
	[ "$owner_id" != "$host_id" ] && {
		logger -t $prog "victim lease $host_id owned by $owner_id:$owner_gen"
		return 1
	}

	pid="$(pidof fence_sanlockd)"
	[ -z "$pid" ] && {
		logger -t $prog "Unable to determine fence_sanlockd pid"
		return 1
	}

	# pid file should be unique for each instance so multiple
	# fence_sanlock's can run in parallel.  fence_sanlockd may read
	# this file to see which host_id we are fencing.
	pidfile=/run/$prog/$prog.pid.$$
	echo "$$ host_id $host_id gen $owner_gen ver $ver timestamp $timestamp" > $pidfile
	logger -t $prog "$$ host_id $host_id gen $owner_gen ver $ver timestamp $timestamp"

	loop=0

	# FIXME: should this loop have a retry limit?
	while :
	do
		loop=$(($loop+1))

		tmp_pid="$(pidof fence_sanlockd)"
		[ -z "$tmp_pid" ] && {
			logger -t $prog "fence_sanlockd not running"
			unlink $pidfile
			return 1
		}

		sanlock client acquire -r fence:h$host_id:$path:$offset -p $pid > /dev/null 2>&1
		[ "$?" = 0 ] && {
			# fence success
			sanlock client release -r fence:h$host_id:$path:$offset -p $pid > /dev/null 2>&1
			# save the status now: the "[" test below overwrites $?
			rv=$?
			[ "$rv" != 0 ] && {
				logger -t $prog "release $host_id error $rv"
			}
			unlink $pidfile
			return 0
		}

		if [ "$loop" = 1 ]; then
			# acquire probably failed because the victim is
			# still alive and renewing its lease, (we could
			# verify that by checking the error code, but the
			# error codes are currently messed up due to
			# negation).  use a request on the victim's lease
			# to tell it that it's being fenced and needs to
			# reset.  the -f 2 causes SIGUSR1 to be sent to
			# fence_sanlockd on the victim.

			# We send SIGUSR2 to our own fence_sanlockd to
			# tell it that we are fencing someone else.  If
			# fence_sanlockd gets both SIGUSR1 indicating that
			# someone is fencing it, and it gets SIGUSR2
			# indicating that it is fencing someone, it knows
			# that it's the special situation of two nodes
			# fencing each other in a two node cluster.  In
			# this case, the low host_id can choose to survive.
			kill -s SIGUSR2 $pid

			sanlock client request -r fence:h$host_id:$path:$offset:$((ver + 1)) -f 2 > /dev/null 2>&1
			# save the status now: the "[" test below overwrites $?
			rv=$?
			[ "$rv" != 0 ] && {
				logger -t $prog "request $host_id error $rv"
			}
		fi

		sleep 10

		# Reread the leader; if the victim's lease has been
		# reacquired cleanly by the victim host (same host_id, new
		# generation), we can quit with success
		read_leader
		[ "$?" != 0 ] && {
			unlink $pidfile
			return 1
		}

		tmp_id="$(echo "$leader" | grep owner_id | awk '{print $NF}')"
		tmp_gen="$(echo "$leader" | grep owner_gen | awk '{print $NF}')"

		if [ "$owner_id" -eq "$tmp_id" ] && [ "$owner_gen" -lt "$tmp_gen" ]; then
			logger -t $prog "victim $owner_id:$owner_gen reacquired lease gen $tmp_gen"
			unlink $pidfile
			return 0
		fi

		if [ "$owner_id" -ne "$tmp_id" ]; then
			logger -t $prog "victim $owner_id:$owner_gen acquired by $tmp_id:$tmp_gen"
			unlink $pidfile
			return 1
		fi
	done

	# not reached: the loop above only ends through a return
	unlink $pidfile
	return 0
}

# "status" action: report whether the lease of host_id is held.
# Exits 0 ("Status: ON") when the leader timestamp is non-zero and
# 2 ("Status: OFF") when it is zero; returns 1 if the leader is unreadable.
action_status() {
	read_leader || return 1

	timestamp="$(echo "$leader" | grep ^timestamp | awk '{print $NF}')"

	# lease is released, so host is "off"
	[ "$timestamp" = 0 ] && {
		echo "Status: OFF"
		exit 2
	}

	# lease is held, so host is "on"
	echo "Status: ON"
	exit 0
}

# "sanlock_init" action: initialize the "fence" lockspace at offset 0 of
# $path, then one resource lease per host_id (1..max_hosts) at
# host_id * align.  Returns 1 on the first failing init.
sanlock_init() {
	# initialize lease path
	echo -n "Initializing fence sanlock lockspace on $path: "
	sanlock direct init -s fence:0:$path:0 \
		> /dev/null 2>/dev/null || \
		{ echo "error $?" && return 1; }
	echo "ok"

	echo -n "Initializing $max_hosts sanlock host leases on $path: "
	for host_id in $(seq 1 $max_hosts); do
		offset=$((host_id * $align))
		sanlock direct init -r fence:h$host_id:$path:$offset \
			> /dev/null 2>/dev/null || \
			{ echo "error $? for host $host_id" && return 1; }
	done
	echo "ok"

	return 0
}

[ -z "$action" ] && action=off

# check actions and options compatibility

# all actions beside metadata needs storage
[ "$action" != "metadata" ] && {
	[ -z "$path" ] && {
		echo "storage path argument required"
		exit 1
	}

	# all actions beside sanlock_init needs host_id
	[ "$action" != "sanlock_init" ] && [ -z "$host_id" ] && {
		echo "host_id argument required"
		exit 1
	}

	# FIXME: add direct align command to sanlock
	# align="$(sanlock direct align $path)"
	align=1048576
}

# verify host_id parameter
[ -n "$host_id" ] && {
	if [ "$host_id" -lt 1 ] || [ "$host_id" -gt "$max_hosts" ]; then
		echo "host_id must be between 1 and $max_hosts"
		exit 1
	fi

	# each host's resource lease lives at host_id * align on $path
	offset=$((host_id * $align))
}

case "$action" in
	metadata)
		metadata
	;;
	sanlock_init)
		sanlock_init
	;;
	on)
		action_on
	;;
	off)
		action_off
	;;
	status)
		action_status
	;;
	*)
		echo $"Unknown action: $action"
		exit 1
	;;
esac

exit $?
 sanlock-3.8.2/fence_sanlock/fence_sanlockd.8000066400000000000000000000013251371427612200210050ustar00rootroot00000000000000.TH FENCE_SANLOCKD 8 2012-09-26 .SH NAME fence_sanlockd \- daemon for fence_sanlock agent .SH SYNOPSIS .B fence_sanlockd [OPTIONS] .SH DESCRIPTION The fence_sanlockd daemon is used by the fence_sanlock agent. See .BR fence_sanlock (8), for full description. .SH OPTIONS .B \-D Enable debugging to stderr and don't fork. .BI \-p " path" Path to shared storage with sanlock leases. .BI \-i " host_id" Local sanlock host_id (1-128). .B \-w Wait for fence_sanlockd -s to send options (p,i). .B \-s Send options (p,i) to waiting fence_sanlockd -w. .B \-1 Send SIGUSR1 to running fence_sanlockd.
.SH SEE ALSO .BR fence_sanlock (8), .BR sanlock (8), .BR wdmd (8), .BR fence_node (8), .BR fenced (8) sanlock-3.8.2/fence_sanlock/fence_sanlockd.c000066400000000000000000000535761371427612200210770ustar00rootroot00000000000000/* * Copyright 2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock.h" #include "sanlock_admin.h" #include "sanlock_resource.h" #include "sanlock_direct.h" #include "wdmd.h" #define MAX_HOSTS 128 /* keep in sync with fence_sanlock definition */ #define LIVE_INTERVAL 5 #define EXPIRE_INTERVAL 20 #define DAEMON_RUN_DIR "/run/fence_sanlockd" #define AGENT_RUN_DIR "/run/fence_sanlock" static char *prog_name = (char *)"fence_sanlockd"; static int we_are_victim; static int we_are_fencing; static int init_shutdown; static int lockspace_recovery; static int daemon_debug; static int our_host_id; static char lease_path[PATH_MAX]; static struct sanlk_lockspace ls; static struct sanlk_resource *r; static struct sanlk_disk disk; static char rdbuf[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; static char lockfile_path[PATH_MAX]; static char fifo_path[PATH_MAX]; static char fifo_line[PATH_MAX]; static char key1[PATH_MAX]; static char key2[PATH_MAX]; static char val1[PATH_MAX]; static char val2[PATH_MAX]; struct client { int used; int fd; void *workfn; void *deadfn; }; #define CLIENT_NALLOC 3 static int client_maxi; static int client_size = 0; static struct client *client = NULL; static struct pollfd *pollfd = NULL; #define log_debug(fmt, args...) 
\ do { \ if (daemon_debug) \ fprintf(stderr, "%llu " fmt "\n", (unsigned long long)time(NULL), ##args); \ } while (0) #define log_error(fmt, args...) \ do { \ log_debug(fmt, ##args); \ syslog(LOG_ERR, fmt, ##args); \ } while (0) static uint64_t monotime(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return ts.tv_sec; } static void client_alloc(void) { int i; if (!client) { client = malloc(CLIENT_NALLOC * sizeof(struct client)); pollfd = malloc(CLIENT_NALLOC * sizeof(struct pollfd)); } else { client = realloc(client, (client_size + CLIENT_NALLOC) * sizeof(struct client)); pollfd = realloc(pollfd, (client_size + CLIENT_NALLOC) * sizeof(struct pollfd)); if (!pollfd) log_error("can't alloc for pollfd"); } if (!client || !pollfd) log_error("can't alloc for client array"); for (i = client_size; i < client_size + CLIENT_NALLOC; i++) { memset(&client[i], 0, sizeof(struct client)); client[i].fd = -1; pollfd[i].fd = -1; pollfd[i].revents = 0; } client_size += CLIENT_NALLOC; } static int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci)) { int i; if (!client) client_alloc(); again: for (i = 0; i < client_size; i++) { if (!client[i].used) { client[i].used = 1; client[i].workfn = workfn; client[i].deadfn = deadfn; client[i].fd = fd; pollfd[i].fd = fd; pollfd[i].events = POLLIN; if (i > client_maxi) client_maxi = i; return i; } } client_alloc(); goto again; } static int read_lockfile(int *pid) { char buf[16]; int fd, rv; sprintf(lockfile_path, "%s/%s.pid", DAEMON_RUN_DIR, prog_name); fd = open(lockfile_path, O_RDONLY); if (fd < 0) { log_error("lockfile open error %s: %s", lockfile_path, strerror(errno)); return -1; } memset(buf, 0, sizeof(buf)); rv = read(fd, buf, sizeof(buf)); if (rv < 0) { log_error("lockfile read error %s: %s", lockfile_path, strerror(errno)); close(fd); return -1; } *pid = atoi(buf); close(fd); return 0; } static int lockfile(void) { char buf[16]; struct flock lock; mode_t old_umask; int fd, rv; old_umask = umask(0022); rv = 
mkdir(DAEMON_RUN_DIR, 0775); if (rv < 0 && errno != EEXIST) { umask(old_umask); return rv; } umask(old_umask); sprintf(lockfile_path, "%s/%s.pid", DAEMON_RUN_DIR, prog_name); fd = open(lockfile_path, O_CREAT|O_WRONLY|O_CLOEXEC, 0644); if (fd < 0) { log_error("lockfile open error %s: %s", lockfile_path, strerror(errno)); return -1; } lock.l_type = F_WRLCK; lock.l_start = 0; lock.l_whence = SEEK_SET; lock.l_len = 0; rv = fcntl(fd, F_SETLK, &lock); if (rv < 0) { log_error("lockfile setlk error %s: %s", lockfile_path, strerror(errno)); goto fail; } rv = ftruncate(fd, 0); if (rv < 0) { log_error("lockfile truncate error %s: %s", lockfile_path, strerror(errno)); goto fail; } memset(buf, 0, sizeof(buf)); snprintf(buf, sizeof(buf), "%d\n", getpid()); rv = write(fd, buf, strlen(buf)); if (rv <= 0) { log_error("lockfile write error %s: %s", lockfile_path, strerror(errno)); goto fail; } return fd; fail: close(fd); return -1; } static void process_signals(int ci) { struct signalfd_siginfo fdsi; ssize_t rv; int fd = client[ci].fd; rv = read(fd, &fdsi, sizeof(struct signalfd_siginfo)); if (rv != sizeof(struct signalfd_siginfo)) { return; } log_debug("signal %d from pid %d", fdsi.ssi_signo, fdsi.ssi_pid); if (fdsi.ssi_signo == SIGHUP) { init_shutdown = 1; } if (fdsi.ssi_signo == SIGTERM) { lockspace_recovery = 1; } if (fdsi.ssi_signo == SIGUSR1) { we_are_victim = 1; } if (fdsi.ssi_signo == SIGUSR2) { we_are_fencing = 1; } } static int setup_signals(void) { sigset_t mask; int fd, rv; sigemptyset(&mask); sigaddset(&mask, SIGTERM); sigaddset(&mask, SIGHUP); sigaddset(&mask, SIGUSR1); sigaddset(&mask, SIGUSR2); rv = sigprocmask(SIG_BLOCK, &mask, NULL); if (rv < 0) return rv; fd = signalfd(-1, &mask, 0); if (fd < 0) return -errno; client_add(fd, process_signals, NULL); return 0; } static int wait_options(void) { int fd, rv; snprintf(fifo_path, PATH_MAX-1, "%s/%s.fifo", DAEMON_RUN_DIR, prog_name); rv = mkfifo(fifo_path, (S_IRUSR | S_IWUSR)); if (rv && errno != EEXIST) { 
log_error("wait_options mkfifo error %d %s", errno, fifo_path); return -1; } fd = open(fifo_path, O_RDONLY|O_CLOEXEC); if (fd < 0) { log_error("wait_options open error %d %s", errno, fifo_path); rv = fd; goto out_unlink; } memset(fifo_line, 0, sizeof(fifo_line)); rv = read(fd, fifo_line, sizeof(fifo_line)); if (rv < 0) { log_error("wait_options read error %d", errno); goto out; } rv = sscanf(fifo_line, "%s %s %s %s", key1, val1, key2, val2); if (rv != 4) { log_error("wait_options scan error %d", rv); rv = -1; goto out; } if (strcmp(key1, "-p") || strcmp(key2, "-i")) { log_error("wait_options args error"); rv = -1; goto out; } strncpy(lease_path, val1, PATH_MAX-1); our_host_id = atoi(val2); if (!our_host_id || our_host_id > MAX_HOSTS) { log_error("wait_options invalid host_id"); rv = -1; goto out; } if (!lease_path[0]) { log_error("wait_options invalid path"); rv = -1; goto out; } log_debug("wait_options -p %s -i %d", lease_path, our_host_id); rv = 0; out: close(fd); out_unlink: unlink(fifo_path); return rv; } static int send_options(void) { int fd, rv; snprintf(fifo_path, PATH_MAX-1, "%s/%s.fifo", DAEMON_RUN_DIR, prog_name); fd = open(fifo_path, O_WRONLY|O_CLOEXEC); if (fd < 0) { fprintf(stderr, "open error %d %s\n", errno, fifo_path); return -1; } memset(fifo_line, 0, sizeof(fifo_line)); snprintf(fifo_line, PATH_MAX-1, "-p %s -i %d", lease_path, our_host_id); rv = write(fd, fifo_line, sizeof(fifo_line)); if (rv < 0) { fprintf(stderr, "write error %d %s\n", errno, fifo_path); } else { rv = 0; } close(fd); return rv; } static int send_signal(int sig) { int rv, pid; openlog("fence_sanlockd-1", LOG_CONS | LOG_PID, LOG_DAEMON); rv = read_lockfile(&pid); if (rv < 0) return rv; rv = kill(pid, sig); if (rv < 0) { log_error("kill sig %d pid %d error %d", sig, pid, errno); } else { syslog(LOG_INFO, "sent signal %d to pid %d", sig, pid); } return rv; } /* * A running fence_sanlock agent has a pid file we can read. 
* We use this to check what host_id it's fencing, so we can * see if we are the low host_id in a two_node fencing duel. * We also check /proc/ to verify that the agent is * still running (that the pid file isn't stale from the * agent being killed). */ static int check_fence_agent(int *victim_host_id) { DIR *d; FILE *file; struct dirent *de; char path[PATH_MAX]; char rest[512]; char name[512]; int agent_pid, victim_id, rv; int error = -ENOENT; d = opendir(AGENT_RUN_DIR); if (!d) return -1; while ((de = readdir(d))) { if (de->d_name[0] == '.') continue; if (strncmp(de->d_name, "fence_sanlock.pid.", strlen("fence_sanlock.pid."))) continue; agent_pid = 0; victim_id = 0; memset(rest, 0, sizeof(rest)); memset(name, 0, sizeof(name)); log_debug("read %s", de->d_name); /* * read /run/fence_sanlock/fence_sanlock.pid. * to get the pid of fence_sanlock and the victim's host_id * * read /proc/pid/comm to check that the pid from that file * is still running and hasn't been killed * * if both of these checks are successful, then return 0 * with the victim host id * * if any fails, continue to check for another pid file */ memset(path, 0, sizeof(path)); snprintf(path, PATH_MAX-1, "%s/%s", AGENT_RUN_DIR, de->d_name); file = fopen(path, "r"); if (!file) { log_debug("open error %d %s", errno, path); continue; } rv = fscanf(file, "%d host_id %d %[^\n]s\n", &agent_pid, &victim_id, rest); fclose(file); log_debug("agent_pid %d victim %d %s", agent_pid, victim_id, rest); if (rv != 3 || !agent_pid || !victim_id) { log_debug("%s scan file error %d", de->d_name, rv); continue; } memset(path, 0, sizeof(path)); snprintf(path, PATH_MAX-1, "/proc/%d/comm", agent_pid); file = fopen(path, "r"); if (!file) { log_debug("%s open proc error %d %s", de->d_name, errno, path); continue; } rv = fscanf(file, "%s", name); fclose(file); if (rv != 1 || strncmp(name, "fence_sanlock", strlen("fence_sanlock"))) { log_debug("%s scan proc error %d %s", de->d_name, rv, name); continue; } /* * we found a running 
fence_sanlock process, * return the host_id that it's fencing */ *victim_host_id = victim_id; error = 0; break; } closedir(d); return error; } static void print_usage(void) { printf("Usage:\n"); printf("fence_sanlockd [options]\n"); printf("\n"); printf("Options:\n"); printf(" -D Enable debugging to stderr and don't fork\n"); printf(" -p Path to shared storage with sanlock leases\n"); printf(" -i Local sanlock host_id (1-%d)\n", MAX_HOSTS); printf(" -w Wait for fence_sanlockd -s to send options (p,i)\n"); printf(" -s Send options (p,i) to waiting fence_sanlockd -w\n"); printf(" -1 Send SIGUSR1 to running fence_sanlockd\n"); printf(" -h Print this help, then exit\n"); printf(" -V Print program version information, then exit\n"); } int main(int argc, char *argv[]) { void (*workfn) (int ci); void (*deadfn) (int ci); uint64_t live_time, now; int poll_timeout; int sleep_seconds; int send_opts = 0, wait_opts = 0; int send_sigusr1 = 0; int cont = 1; int optchar; int sock, con, rv, i; int align; int victim_host_id; while (cont) { optchar = getopt(argc, argv, "Dp:i:hVws1"); switch (optchar) { case 'D': daemon_debug = 1; break; case 'p': strcpy(lease_path, optarg); break; case 'i': our_host_id = atoi(optarg); if (our_host_id > MAX_HOSTS) { fprintf(stderr, "invalid host_id %d, use 1-%d\n", our_host_id, MAX_HOSTS); exit(1); } break; case 'w': wait_opts = 1; break; case 's': send_opts = 1; break; case '1': send_sigusr1 = 1; break; case 'h': print_usage(); exit(0); case 'V': printf("fence_sanlockd %s (built %s %s)\n", VERSION, __DATE__, __TIME__); exit(0); case EOF: cont = 0; break; default: fprintf(stderr, "unknown option %c\n", optchar); exit(1); }; } if (send_sigusr1) { rv = send_signal(SIGUSR1); return rv; } if (wait_opts && send_opts) { fprintf(stderr, "-w and -s options cannot be used together\n"); exit(1); } if (!wait_opts && (!our_host_id || !lease_path[0])) { fprintf(stderr, "-i and -p options required\n"); exit(1); } if (send_opts) { rv = send_options(); return rv; } 
if (!daemon_debug) { if (daemon(0, 0) < 0) { fprintf(stderr, "cannot fork daemon\n"); exit(EXIT_FAILURE); } } openlog(prog_name, LOG_CONS | LOG_PID, LOG_DAEMON); rv = lockfile(); if (rv < 0) goto out; rv = setup_signals(); if (rv < 0) goto out_lockfile; if (wait_opts) { rv = wait_options(); if (rv < 0) goto out_lockfile; } con = wdmd_connect(); if (con < 0) { log_error("wdmd connect error %d", con); goto out_lockfile; } rv = wdmd_register(con, (char *)"fence_sanlockd"); if (rv < 0) { log_error("wdmd register error %d", rv); goto out_lockfile; } rv = wdmd_refcount_set(con); if (rv < 0) { log_error("wdmd refcount error %d", rv); goto out_lockfile; } sock = sanlock_register(); if (sock < 0) { log_error("register error %d", sock); goto out_refcount; } rv = sanlock_killpath(sock, 0, "fence_sanlockd", (char *)"-1"); if (rv < 0) { log_error("killpath error %d", sock); goto out_refcount; } rv = sanlock_restrict(sock, SANLK_RESTRICT_SIGKILL); if (rv < 0) { log_error("restrict error %d", sock); goto out_refcount; } memset(&disk, 0, sizeof(disk)); sprintf(disk.path, "%s", lease_path); align = sanlock_direct_align(&disk); if (align < 0) { log_error("direct_align error %d", align); goto out_refcount; } memset(&ls, 0, sizeof(ls)); sprintf(ls.host_id_disk.path, "%s", lease_path); strcpy(ls.name, "fence"); ls.host_id = our_host_id; log_debug("add_lockspace begin"); rv = sanlock_add_lockspace(&ls, 0); if (rv < 0) { log_error("add_lockspace error %d", rv); goto out_refcount; } log_debug("add_lockspace done %d", rv); /* * If we allowed the lockspace to be cleanly released * while our orphan lock still existed, then another * host could acquire our lease as soon as we release * the lockspace delta lease. 
*/ rv = sanlock_set_config(ls.name, 0, SANLK_CONFIG_USED_BY_ORPHANS, NULL); if (rv < 0) { log_error("set_config error %d", rv); goto out_lockspace; } memset(rdbuf, 0, sizeof(rdbuf)); r = (struct sanlk_resource *)&rdbuf; strcpy(r->lockspace_name, "fence"); sprintf(r->name, "h%d", our_host_id); sprintf(r->disks[0].path, "%s", lease_path); r->disks[0].offset = our_host_id * align; r->num_disks = 1; r->flags = SANLK_RES_PERSISTENT; log_debug("acquire begin"); rv = sanlock_acquire(sock, -1, 0, 1, &r, NULL); if (rv < 0) { log_error("acquire error %d", rv); goto out_lockspace; } log_debug("acquire done %d", rv); /* at this point we can be fenced by someone */ now = monotime(); live_time = now; log_debug("test live %llu", (unsigned long long)now); rv = wdmd_test_live(con, now, now + EXPIRE_INTERVAL); if (rv < 0) { log_error("wdmd_test_live first error %d", rv); goto out_release; } sleep_seconds = live_time + LIVE_INTERVAL - monotime(); poll_timeout = (sleep_seconds > 0) ? sleep_seconds * 1000 : 500; while (1) { rv = poll(pollfd, client_maxi + 1, poll_timeout); if (rv == -1 && errno == EINTR) continue; if (rv < 0) { /* not sure */ } for (i = 0; i <= client_maxi; i++) { if (client[i].fd < 0) continue; if (pollfd[i].revents & POLLIN) { workfn = client[i].workfn; if (workfn) workfn(i); } if (pollfd[i].revents & (POLLERR | POLLHUP | POLLNVAL)) { deadfn = client[i].deadfn; if (deadfn) deadfn(i); } } now = monotime(); if (init_shutdown) { /* * FIXME: how to be sure that it's safe for us to shut * down? i.e. nothing is running that needs fencing? * * There are at least two distinct problems: * * 1. stopping when dlm/gfs instances exist in the * kernel, but no userland cluster processes exist, * i.e. they have exited uncleanly, and the node * currently needs fencing. * * 2. stopping when dlm_controld is running, but no * lockspaces currently exist. 
Point 1 would pass, * but dlm_controld assumes fencing is enabled, and * would allow a new lockspace to be created, without * fencing protection if we are not running. * * For now, have the init script check that: * - /sys/kernel/config/dlm/cluster/ is empty * (dlm_controld is not running) * - /sys/kernel/dlm/ is empty * (lockspaces do not exist in the kernel) * * The init script has to use SIGHUP to stop us instead * of SIGTERM because the sanlock daemon uses SIGTERM to * tell us that the lockspace has failed. */ log_error("shutdown"); rv = wdmd_test_live(con, 0, 0); if (rv < 0) log_error("wdmd_test_live 0 error %d", rv); break; } if (lockspace_recovery) { /* * The sanlock daemon sends SIGTERM when the lockspace * host_id cannot be renewed for a while and it enters * recovery. */ log_error("sanlock renewals failed, our watchdog will fire"); } if (we_are_victim && we_are_fencing) { /* * Automatically resolve two_node fencing duel. * * Two nodes are fencing each other, which happens * in a two_node cluster where each can has quorum * by itself. We pick the low host_id to survive. * * (Might we get another SIGUSR1 callback due to * the request not being cleared right away? Would * that matter here?) * * Note that a global victim_host_id doesn't work * if more than one fence_sanlock is run concurrently, * i.e. we're fencing more than one host at a time. * But, this doesn't matter because this case is * only concerned about two_node fencing duels where * we can only be fencing one other node. 
*/ rv = check_fence_agent(&victim_host_id); if (!rv) { if (our_host_id < victim_host_id) { log_error("fence duel winner, our_host_id %d other %d", our_host_id, victim_host_id); we_are_victim = 0; we_are_fencing = 0; } else { log_error("fence duel loser, our_host_id %d other %d", our_host_id, victim_host_id); we_are_fencing = 0; } } else { log_error("fence duel ignore, agent %d", rv); we_are_fencing = 0; } } if (!we_are_victim && we_are_fencing) { /* * We can start fencing someone before we notice that * we are also being fenced in a duel. So, don't clear * we_are_fencing until fence_sanlock is finished and * removes fence_sanlock.log * * We do this for all fencing, but it's only really * needed for two_node fencing duels where we need * to be aware of when we are fencing. */ rv = check_fence_agent(&victim_host_id); if (rv < 0) { log_debug("fence agent not found %d", rv); we_are_fencing = 0; victim_host_id = 0; } else { log_debug("fence agent running host_id %d", victim_host_id); } } if (we_are_victim) { /* * The sanlock daemon has seen someone request our * lease, which happens when they run fence_sanlock * against us. In response to the request, our sanlock * daemon has sent us SIGUSR1. * * Do not call wdmd_test_live, so wdmd will see our * connection expire, and will quit petting the * watchdog, which will then fire in 60 sec. sanlock * continues renewing its host_id until the machine * dies, and the node doing fencing will then be able * to acquire our lease host_dead_seconds after our * last sanlock renewal. * * TODO: we could eventually attempt to kill/unmount/etc * anything using shared storage, and if that all works, * then we could do a clean shutdown afterward. That would * often not work, because dlm/gfs would be stuck in the * kernel due to failure (cluster partition) that caused * our fencing, and couldn't be forcibly cleaned up. 
*/ log_error("we are being fenced, our watchdog will fire"); } if (!we_are_victim && !lockspace_recovery && (now - live_time >= LIVE_INTERVAL)) { /* * How to pick the expire_time. From the perspective * of fence_sanlockd the expire_time isn't really * important. It should be far enough in the future * so that it's: * - after the next time we're going to call test_live, * because our test_live calls are obviously meant to * keep it from expiring * - more than 10 seconds in the future because of a * current quirk in wdmd, where it pre-emptively * closes the wd 10 seconds before the actual expire * time (see comments in wdmd for reason). So we * want to be sure we renew at least 10 sec before * the last expire time. * * It shouldn't be too long, because when we see we're * being fenced, we'll quit calling test_live, and we * want our watchdog to reset us in a fairly short amount * time after that (this effects how long the fencing node * has to wait.) The longer the expire_time we provide, * the longer it'll take before wdmd sees it expire, quits * petting the wd, and resets us. * * So, if we have set expire_time to 20 sec in the * future, and we renew once every 5 sec, we have two * chances to renew before a pre-emptive close. */ live_time = now; log_debug("test live %llu", (unsigned long long)now); rv = wdmd_test_live(con, now, now + EXPIRE_INTERVAL); if (rv < 0) log_error("wdmd_test_live error %d", rv); } if (we_are_victim || lockspace_recovery || we_are_fencing) { poll_timeout = 10000; } else { sleep_seconds = live_time + LIVE_INTERVAL - monotime(); poll_timeout = (sleep_seconds > 0) ? 
sleep_seconds * 1000 : 500; } } out_release: sanlock_release(sock, -1, 0, 1, &r); out_lockspace: sanlock_rem_lockspace(&ls, SANLK_REM_ASYNC); out_refcount: wdmd_refcount_clear(con); out_lockfile: unlink(lockfile_path); out: return rv; } sanlock-3.8.2/init.d/000077500000000000000000000000001371427612200143505ustar00rootroot00000000000000sanlock-3.8.2/init.d/fence_sanlockd000077500000000000000000000071011371427612200172330ustar00rootroot00000000000000#!/bin/bash # # fence_sanlockd - daemon for fence_sanlock agent # # chkconfig: 2345 20 80 # description: starts and stops fence_sanlockd # ### BEGIN INIT INFO # Provides: fence_sanlockd # Required-Start: $time $syslog # Required-Stop: $syslog # Should-Start: # Should-Stop: # Default-Start: 2 3 4 5 # Default-Stop: 0 1 6 # Short-Description: starts and stops fence_sanlockd # Description: starts and stops fence_sanlockd ### END INIT INFO . /etc/rc.d/init.d/functions prog="fence_sanlockd" agent="fence_sanlock" runfile="/run/$prog/$prog.pid" fifofile="/run/$prog/$prog.fifo" lockfile="/var/lock/subsys/$prog" exec="/usr/sbin/$prog" FENCESANLOCKDOPTS="-w" [ -f /etc/sysconfig/$prog ] && . /etc/sysconfig/$prog start() { [ -x $exec ] || exit 5 # start wdmd and sanlock daemons if they aren't running service wdmd status > /dev/null 2>&1 || service wdmd start service sanlock status > /dev/null 2>&1 || service sanlock start [ ! -d /run/$prog ] && install -d -m 775 /run/$prog [ ! -d /run/$agent ] && install -d -m 775 /run/$agent [ -n "$(which restorecon)" ] && \ [ -x "$(which restorecon)" ] && \ restorecon /run/$prog [ -n "$(which restorecon)" ] && \ [ -x "$(which restorecon)" ] && \ restorecon /run/$agent echo -n $"Starting $prog: " daemon $prog $FENCESANLOCKDOPTS retval=$? 
echo [ $retval -eq 0 ] && touch $lockfile return $retval } stop() { agent_ps="$(ps ax -o pid,args | grep fence_sanlock | grep -v grep | grep -v fence_sanlockd)" [ -n "$agent_ps" ] && { agent_pid="$(echo $agent_ps | awk '{print $1}')" echo -n "cannot stop while $agent $agent_pid is running" failure; echo return 1 } # Ideally, we'd like a general way to check if anything # needs fencing to continue running, but without that, # check what we know, which is that dlm requires it. if [ -d /sys/kernel/dlm/ ]; then count="$(ls -A /sys/kernel/dlm/ | wc -l)" if [ $count -ne 0 ]; then echo -n "cannot stop while dlm lockspaces exist" failure; echo return 1 fi fi if [ -d /sys/kernel/config/dlm/cluster ]; then # this dir exists while dlm_controld is running echo -n "cannot stop while dlm is running" failure; echo return 1 fi PID=$(pidofproc -p $runfile $prog) # We have to use SIGHUP to mean stop because sanlock # uses SIGTERM to mean that the lockspace failed. echo -n $"Sending stop signal $prog ($PID): " killproc -p $runfile $prog -HUP retval=$? echo if [ $retval -ne 0 ]; then return $retval fi # fence_sanlockd won't see the SIGHUP if it's # still waiting for config from the fifo, so # send invalid config to the fifo to make it fail. 
if [ -p $fifofile ]; then echo "" > $fifofile fi echo -n $"Waiting for $prog ($PID) to stop:" timeout=10 while checkpid $PID; do sleep 1 timeout=$((timeout - 1)) if [ "$timeout" -le 0 ]; then failure; echo return 1 fi done success; echo rm -f $lockfile # stop wdmd and sanlock daemons if they are running service sanlock status > /dev/null 2>&1 && service sanlock stop service wdmd status > /dev/null 2>&1 && service wdmd stop return $retval } restart() { rh_status_q && stop start } reload() { restart } rh_status() { status $prog } rh_status_q() { rh_status >/dev/null 2>&1 } case "$1" in start) rh_status_q && exit 0 $1 ;; stop) rh_status_q || exit 0 $1 ;; restart) $1 ;; reload) rh_status_q || exit 7 $1 ;; force-reload) force_reload ;; status) rh_status ;; condrestart|try-restart) rh_status_q || exit 0 restart ;; *) echo $"Usage $0 {start|stop|status|restart|condrestart|try-restart|reload|force-reload}" exit 2 esac exit $? sanlock-3.8.2/init.d/fence_sanlockd.service000066400000000000000000000004401371427612200206660ustar00rootroot00000000000000[Unit] Description=daemon for fence_sanlock agent After=syslog.target wdmd.service sanlock.service Before=corosync.service [Service] Type=forking ExecStart=/lib/systemd/systemd-fence_sanlockd start ExecStop=/lib/systemd/systemd-fence_sanlockd stop [Install] WantedBy=multi-user.target sanlock-3.8.2/init.d/sanlk-resetd.service000066400000000000000000000003221371427612200203230ustar00rootroot00000000000000[Unit] Description=daemon for host reset After=wdmd.service sanlock.service Requires=wdmd.service sanlock.service [Service] Type=forking ExecStart=/usr/sbin/sanlk-resetd [Install] WantedBy=multi-user.target sanlock-3.8.2/init.d/sanlock000066400000000000000000000037051371427612200157320ustar00rootroot00000000000000#!/bin/bash # # sanlock - SAN-based lock manager # # chkconfig: 2345 97 03 # description: starts and stops sanlock daemon # ### BEGIN INIT INFO # Provides: sanlock # Required-Start: $time $syslog wdmd # Required-Stop: 
$syslog # Should-Start: # Should-Stop: # Default-Start: 2 3 4 5 # Default-Stop: 0 1 6 # Short-Description: starts and stops sanlock daemon # Description: starts and stops sanlock daemon ### END INIT INFO . /etc/rc.d/init.d/functions prog="sanlock" runfile="/run/$prog/$prog.pid" lockfile="/var/lock/subsys/$prog" exec="/usr/sbin/$prog" SANLOCKUSER="sanlock" SANLOCKOPTS="-U $SANLOCKUSER -G $SANLOCKUSER" [ -f /etc/sysconfig/$prog ] && . /etc/sysconfig/$prog start() { [ -x $exec ] || exit 5 if [ ! -d /run/$prog ]; then install -d -o $SANLOCKUSER -g $SANLOCKUSER -m 775 /run/$prog [ -x /sbin/restorecon ] && restorecon /run/$prog fi echo -n $"Starting $prog: " daemon $prog daemon $SANLOCKOPTS retval=$? echo [ $retval -eq 0 ] && touch $lockfile return $retval } stop() { PID=$(pidofproc -p $runfile $prog) echo -n $"Sending stop signal $prog ($PID): " killproc -p $runfile $prog -TERM retval=$? echo if [ $retval -ne 0 ]; then return $retval fi echo -n $"Waiting for $prog ($PID) to stop:" timeout=10 while checkpid $PID; do sleep 1 timeout=$((timeout - 1)) if [ "$timeout" -le 0 ]; then failure; echo return 1 fi done success; echo rm -f $lockfile return $retval } restart() { rh_status_q && stop start } reload() { restart } rh_status() { status $prog } rh_status_q() { rh_status >/dev/null 2>&1 } case "$1" in start) rh_status_q && exit 0 $1 ;; stop) rh_status_q || exit 0 $1 ;; restart) $1 ;; reload) rh_status_q || exit 7 $1 ;; force-reload) force_reload ;; status) rh_status ;; condrestart|try-restart) rh_status_q || exit 0 restart ;; *) echo $"Usage $0 {start|stop|status|restart|condrestart|try-restart|reload|force-reload}" exit 2 esac exit $? 
sanlock-3.8.2/init.d/sanlock.service000066400000000000000000000003561371427612200173700ustar00rootroot00000000000000[Unit] Description=Shared Storage Lease Manager After=syslog.target Wants=wdmd.service [Service] Type=forking ExecStart=/lib/systemd/systemd-sanlock start ExecStop=/lib/systemd/systemd-sanlock stop [Install] WantedBy=multi-user.target sanlock-3.8.2/init.d/sanlock.service.native000066400000000000000000000003311371427612200206460ustar00rootroot00000000000000[Unit] Description=Shared Storage Lease Manager After=syslog.target Wants=wdmd.service [Service] Type=forking ExecStart=/usr/sbin/sanlock daemon SendSIGKILL=no LimitNOFILE=2048 [Install] WantedBy=multi-user.target sanlock-3.8.2/init.d/sanlock.sysconfig000066400000000000000000000006671371427612200177410ustar00rootroot00000000000000# SANLOCKOPTS -- set the command line options for the sanlock daemon # See sanlock man page for full list of command line options. # # Include "-U sanlock -G sanlock" in the option string unless # also changing the SANLOCKUSER above. # # To disable use of watchdog via wdmd #SANLOCKOPTS="-U sanlock -G sanlock -w 0" # # To disable use of watchdog via wdmd and disable high priority features #SANLOCKOPTS="-U sanlock -G sanlock -w 0 -h 0" sanlock-3.8.2/init.d/wdmd000066400000000000000000000044371371427612200152360ustar00rootroot00000000000000#!/bin/bash # # wdmd - watchdog multiplexing daemon # # chkconfig: 2345 97 03 # description: starts and stops wdmd daemon # ### BEGIN INIT INFO # Provides: wdmd # Required-Start: $time $syslog # Required-Stop: $syslog # Should-Start: # Should-Stop: # Default-Start: 2 3 4 5 # Default-Stop: 0 1 6 # Short-Description: starts and stops wdmd daemon # Description: starts and stops wdmd daemon ### END INIT INFO . /etc/rc.d/init.d/functions prog="wdmd" runfile="/run/$prog/$prog.pid" lockfile="/var/lock/subsys/$prog" exec="/usr/sbin/$prog" WDMDGROUP="sanlock" WDMDOPTS="-G $WDMDGROUP" [ -f /etc/sysconfig/$prog ] && . 
/etc/sysconfig/$prog watchdog_probe() { $exec --probe > /dev/null 2>&1 retval=$? return $retval } watchdog_check() { watchdog_probe retval=$? if [ $retval -ne 0 ]; then echo -n $"Loading the softdog kernel module: " modprobe softdog && udevadm settle watchdog_probe retval=$? if [ $retval -ne 0 ]; then failure; echo return 1 fi success; echo fi } start() { watchdog_check [ -x $exec ] || exit 5 if [ ! -d /run/$prog ]; then install -d -g $WDMDGROUP -m 775 /run/$prog [ -x /sbin/restorecon ] && restorecon /run/$prog fi echo -n $"Starting $prog: " daemon $prog $WDMDOPTS retval=$? echo [ $retval -eq 0 ] && touch $lockfile return $retval } stop() { PID=$(pidofproc -p $runfile $prog) echo -n $"Sending stop signal $prog ($PID): " killproc -p $runfile $prog -TERM retval=$? echo if [ $retval -ne 0 ]; then return $retval fi echo -n $"Waiting for $prog ($PID) to stop:" timeout=10 while checkpid $PID; do sleep 1 timeout=$((timeout - 1)) if [ "$timeout" -le 0 ]; then failure; echo return 1 fi done success; echo rm -f $lockfile return $retval } restart() { rh_status_q && stop start } reload() { restart } rh_status() { status $prog } rh_status_q() { rh_status >/dev/null 2>&1 } case "$1" in start) rh_status_q && exit 0 $1 ;; stop) rh_status_q || exit 0 $1 ;; restart) $1 ;; reload) rh_status_q || exit 7 $1 ;; watchdog-check) watchdog_check ;; force-reload) force_reload ;; status) rh_status ;; condrestart|try-restart) rh_status_q || exit 0 restart ;; *) echo $"Usage $0 {start|stop|status|restart|condrestart|try-restart|reload|force-reload}" exit 2 esac exit $? 
sanlock-3.8.2/init.d/wdmd.service000066400000000000000000000003251371427612200166650ustar00rootroot00000000000000[Unit] Description=Watchdog Multiplexing Daemon After=syslog.target [Service] Type=forking ExecStart=/lib/systemd/systemd-wdmd start ExecStop=/lib/systemd/systemd-wdmd stop [Install] WantedBy=multi-user.target sanlock-3.8.2/init.d/wdmd.service.native000066400000000000000000000003411371427612200201500ustar00rootroot00000000000000[Unit] Description=Watchdog Multiplexing Daemon After=syslog.target [Service] Type=forking ExecStartPre=/lib/systemd/systemd-wdmd watchdog-check ExecStart=/usr/sbin/wdmd SendSIGKILL=no [Install] WantedBy=multi-user.target sanlock-3.8.2/init.d/wdmd.sysconfig000066400000000000000000000004721371427612200172340ustar00rootroot00000000000000# WDMDOPTS -- set the command line options for the wdmd daemon # See wdmd man page for full list of command line options. # # Include "-G sanlock" in the option string. # # To enable use of test scripts #WDMDOPTS="-G sanlock -S 1" # # To select a specific watchdog device #WDMDOPTS="-G sanlock -w /dev/watchdog1" sanlock-3.8.2/python/000077500000000000000000000000001371427612200145045ustar00rootroot00000000000000sanlock-3.8.2/python/Makefile000066400000000000000000000005711371427612200161470ustar00rootroot00000000000000# Copyright 2010-2011 Red Hat, Inc. # # This copyrighted material is made available to anyone wishing to use, # modify, copy, or redistribute it subject to the terms and conditions # of the GNU General Public License v.2. PYTHON := python$(PY_VERSION) all: $(PYTHON) setup.py build $(BUILDARGS) install: $(PYTHON) setup.py install --root=$(DESTDIR) clean: rm -rf build sanlock-3.8.2/python/example.py000066400000000000000000000052201371427612200165100ustar00rootroot00000000000000# Copyright (C) 2019 Red Hat, Inc. 
# # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. from __future__ import print_function import pwd import grp import os import time import signal import tempfile import sanlock HOST_ID = 1 LOCKSPACE_NAME = "lockspace1" RESOURCE_NAME = "resource1" def sigTermHandler(): print("SIGTERM signal received") def main(): signal.signal(signal.SIGTERM, sigTermHandler) print("Creating the sanlock disk") fd, disk = tempfile.mkstemp() os.close(fd) os.chown( disk, pwd.getpwnam("sanlock").pw_uid, grp.getgrnam("sanlock").gr_gid) offset = sanlock.get_alignment(disk) SNLK_DISKS = [(disk, offset)] print("Registering to sanlock") fd = sanlock.register() print("Initializing '%s'" % (LOCKSPACE_NAME,)) sanlock.write_lockspace(LOCKSPACE_NAME, disk, align=1048576, sector=512) print("Initializing '%s' on '%s'" % (RESOURCE_NAME, LOCKSPACE_NAME)) sanlock.write_resource( LOCKSPACE_NAME, RESOURCE_NAME, SNLK_DISKS, align=1048576, sector=512) print("Acquiring the id '%i' on '%s'" % (HOST_ID, LOCKSPACE_NAME)) sanlock.add_lockspace(LOCKSPACE_NAME, HOST_ID, disk) try: print("Acquiring '%s' on '%s'" % (RESOURCE_NAME, LOCKSPACE_NAME)) sanlock.acquire( LOCKSPACE_NAME, RESOURCE_NAME, SNLK_DISKS, slkfd=fd, version=0) while True: print("Trying to get lockspace '%s' hosts" % LOCKSPACE_NAME) try: hosts_list = sanlock.get_hosts(LOCKSPACE_NAME) except sanlock.SanlockException as e: if e.errno != os.errno.EAGAIN: raise else: print("Lockspace '%s' hosts: " % LOCKSPACE_NAME, hosts_list) break time.sleep(5) owners = sanlock.read_resource_owners( LOCKSPACE_NAME, RESOURCE_NAME, SNLK_DISKS, align=1048576, sector=512) print("Resource '%s' owners: %s" % (RESOURCE_NAME, owners)) print("Releasing '%s' on '%s'" % (RESOURCE_NAME, LOCKSPACE_NAME)) sanlock.release(LOCKSPACE_NAME, RESOURCE_NAME, SNLK_DISKS, slkfd=fd) except 
Exception as e: print("Exception: ", e) finally: print("Releasing the id '%i' on '%s'" % (HOST_ID, LOCKSPACE_NAME)) sanlock.rem_lockspace(LOCKSPACE_NAME, HOST_ID, disk) print("Removing the sanlock disk") os.remove(disk) if __name__ == '__main__': main() sanlock-3.8.2/python/sanlock.c000066400000000000000000001535671371427612200163230ustar00rootroot00000000000000/* * Copyright 2010-2019 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #ifndef __unused #define __unused __attribute__ ((unused)) #endif #define MODULE_NAME "sanlock" #define BIND_ERROR -1000 /* Functions prototypes */ static void set_sanlock_error(int en, char *msg); static int parse_disks(PyObject *obj, struct sanlk_resource **res_ret); static void set_error(PyObject *exception, const char* format, PyObject* obj); /* Sanlock module */ PyDoc_STRVAR(pydoc_sanlock, "\ Copyright (C) 2010-2019 Red Hat, Inc.\n\ This copyrighted material is made available to anyone wishing to use,\n\ modify, copy, or redistribute it subject to the terms and conditions\n\ of the GNU General Public License v2 or (at your option) any later version."); /* Sanlock exception */ static PyObject *py_exception; static void set_sanlock_error(int en, char *msg) { const char *err_name; PyObject *exc_tuple; if (en < 0 && en > -200) { en = -en; err_name = strerror(en); } else { /* Safe to call without releasing the GIL. */ err_name = sanlock_strerror(en); } exc_tuple = Py_BuildValue("(iss)", en, msg, err_name); if (exc_tuple == NULL) { PyErr_NoMemory(); } else { PyErr_SetObject(py_exception, exc_tuple); Py_DECREF(exc_tuple); } } /* * Converts a unicode path into PyBytes object. 
* If conversion succeeds addr will hold a reference to a new * PyBytes object containing bytes represenation of the system path * given in arg object. * Returns 1 on successful operation, 0 otherwise. * Py2 implementation is based on Py3's PyUnicode_FSConverter[1]. * Py3 implementation wraps call PyUnicode_FSConverter and eliminates * the cleanup support in order to make usage flow the same between * versions. * [1] https://github.com/python/cpython/blob/master/Objects/unicodeobject.c#L3818 */ static int pypath_converter(PyObject* arg, void* addr) { assert(arg && "path converter does not support cleanup (arg is NULL)"); #if PY_MAJOR_VERSION == 2 /* python 2 implementation */ PyObject *output = NULL; Py_ssize_t size; const char *data; if (PyBytes_Check(arg)) { Py_INCREF(arg); output = arg; } else { output = PyUnicode_AsEncodedString(arg, Py_FileSystemDefaultEncoding, NULL); if (!output) return 0; assert(PyBytes_Check(output)); } size = PyBytes_GET_SIZE(output); data = PyBytes_AS_STRING(output); if ((size_t)size != strlen(data)) { PyErr_Format(PyExc_ValueError, "Embedded null byte"); Py_DECREF(output); return 0; } *(PyObject**)addr = output; return 1; #else /* python 3 call wrapper */ int rv = PyUnicode_FSConverter(arg, addr); /* python 2 does not suppot cleanups - same applies here */ if (rv == Py_CLEANUP_SUPPORTED) rv = 1; return rv; #endif } static uint64_t pyinteger_as_unsigned_long_long_mask(PyObject *obj) { #if PY_MAJOR_VERSION == 2 return PyInt_AsUnsignedLongLongMask(obj); #else return PyLong_AsUnsignedLongLongMask(obj); #endif } /* * Returns NULL-terminated representation of the contents of obj. * * obj must be a string object (py2) or Unicode object (py3), otherwise returns NULL * and raises TypeError.[1][2] * * The returned pointer refers to the internal buffer of string, not a copy. It must not be * deallocated, and the object must be kept alive as long as the retruned pointer is used. 
* [1] https://docs.python.org/2/c-api/string.html#c.PyString_AsString * [2] https://docs.python.org/3/c-api/unicode.html#c.PyUnicode_AsUTF8 * */ static const char* pystring_as_cstring(PyObject *obj) { #if PY_MAJOR_VERSION == 2 return PyString_AsString(obj); #else return PyUnicode_AsUTF8(obj); #endif } static int validate_path(PyObject *path) { if (PyBytes_Size(path) > SANLK_PATH_LEN - 1) { set_error(PyExc_ValueError, "Path is too long: %s", path); return 0; } return 1; } static int parse_single_disk(PyObject* disk, struct sanlk_disk* res_disk) { int rv = 0; PyObject *path = NULL; uint64_t offset; if (!PyTuple_Check(disk)) { set_error(PyExc_ValueError, "Invalid disk %s", disk); goto finally; } if (!PyArg_ParseTuple(disk, "O&K", pypath_converter, &path, &offset)) { /* Override the error since it confusing in this context. */ set_error(PyExc_ValueError, "Invalid disk %s", disk); goto finally; } if (!validate_path(path)) goto finally; strncpy(res_disk->path, PyBytes_AsString(path), SANLK_PATH_LEN - 1); res_disk->offset = offset; rv = 1; finally: Py_XDECREF(path); return rv; } static struct sanlk_resource * create_resource(int num_disks) { size_t size = sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk) * num_disks; struct sanlk_resource *res = calloc(1, size); if (res == NULL) { PyErr_NoMemory(); return NULL; } res->num_disks = num_disks; return res; } static int parse_disks(PyObject *obj, struct sanlk_resource **res_ret) { int num_disks; struct sanlk_resource *res; num_disks = PyList_Size(obj); res = create_resource(num_disks); if (res == NULL) return -1; for (int i = 0; i < num_disks; i++) { PyObject *disk = PyList_GetItem(obj,i); if (!parse_single_disk(disk, &(res->disks[i]))) { goto exit_fail; } } *res_ret = res; return 0; exit_fail: free(res); return -1; } enum {SECTOR_SIZE_512 = 512, SECTOR_SIZE_4K = 4096}; static int add_sector_flag(int sector, uint32_t *flags) { switch (sector) { case SECTOR_SIZE_512: *flags |= SANLK_LSF_SECTOR512; break; case 
SECTOR_SIZE_4K: *flags |= SANLK_LSF_SECTOR4K; break; default: PyErr_Format(PyExc_ValueError, "Invalid sector value: %d", sector); return -1; } return 0; } enum { ALIGNMENT_1M = 1048576, ALIGNMENT_2M = 2097152, ALIGNMENT_4M = 4194304, ALIGNMENT_8M = 8388608 }; static int add_align_flag(long align, uint32_t *flags) { switch (align) { case ALIGNMENT_1M: *flags |= SANLK_RES_ALIGN1M; break; case ALIGNMENT_2M: *flags |= SANLK_RES_ALIGN2M; break; case ALIGNMENT_4M: *flags |= SANLK_RES_ALIGN4M; break; case ALIGNMENT_8M: *flags |= SANLK_RES_ALIGN8M; break; default: PyErr_Format(PyExc_ValueError, "Invalid align value: %ld", align); return -1; } return 0; } static void set_error(PyObject* exception, const char* format, PyObject* obj) { const char* str_rep = ""; PyObject* rep = PyObject_Repr(obj); if (rep) str_rep = pystring_as_cstring(rep); PyErr_Format(exception, format, str_rep); Py_XDECREF(rep); } static PyObject * hosts_to_list(struct sanlk_host *hss, int hss_count) { PyObject *ls_list = PyList_New(hss_count); if (ls_list == NULL) goto exit_fail; for (int i = 0; i < hss_count; i++) { PyObject *ls_entry = Py_BuildValue( "{s:K,s:K,s:K,s:I,s:I}", "host_id", hss[i].host_id, "generation", hss[i].generation, "timestamp", hss[i].timestamp, "io_timeout", hss[i].io_timeout, "flags", hss[i].flags); if (ls_entry == NULL) goto exit_fail; /* Steals reference to ls_entry. */ if (PyList_SetItem(ls_list, i, ls_entry) != 0) { Py_DECREF(ls_entry); goto exit_fail; } } return ls_list; exit_fail: Py_XDECREF(ls_list); return NULL; } /* register */ PyDoc_STRVAR(pydoc_register, "\ register() -> int\n\ Register to sanlock daemon and return the connection fd."); static PyObject * py_register(PyObject *self __unused, PyObject *args) { int sanlockfd; /* This sholdn't block, but we don't want to take any chance, as blocking * hangs all threads in the caller process. 
*/ Py_BEGIN_ALLOW_THREADS sanlockfd = sanlock_register(); Py_END_ALLOW_THREADS if (sanlockfd < 0) { set_sanlock_error(sanlockfd, "Sanlock registration failed"); return NULL; } return Py_BuildValue("i", sanlockfd); } /* get_alignment */ PyDoc_STRVAR(pydoc_get_alignment, "\ get_alignment(path) -> int\n\ Get device alignment."); static PyObject * py_get_alignment(PyObject *self __unused, PyObject *args) { int rv = -1; PyObject *path = NULL; struct sanlk_disk disk = {0}; /* parse python tuple */ if (!PyArg_ParseTuple(args, "O&", pypath_converter, &path)) { goto finally; } strncpy(disk.path, PyBytes_AsString(path), SANLK_PATH_LEN - 1); /* get device alignment (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_direct_align(&disk); Py_END_ALLOW_THREADS if (rv < 0) { set_sanlock_error(rv, "Unable to get device alignment"); goto finally; } finally: Py_XDECREF(path); if (rv < 0) return NULL; return Py_BuildValue("i", rv); } /* * Convert parsed arg into PyBytes object. * For Python 2: * If arg is unicode onject, ascii encode it to new PyBytes object passed by addr. * If arg is a bytes object, inc its refcount and pass it in addr. * Set TypeError and return 0 if arg doens not comply to any of the above. * Return 1 on a successful conversion. * For Python 3: * If arg is a bytes object, inc its refcount and pass it in addr. * Set TypeError and return 0 otherwise. * Return 1 on a successful conversion. 
*/ static int convert_to_pybytes(PyObject* arg, void *addr) { assert(arg && "convert_to_pybytes called with NULL arg"); #if PY_MAJOR_VERSION == 2 if (PyUnicode_Check(arg)) { PyObject *bytes = PyUnicode_AsASCIIString(arg); if (bytes == NULL) return 0; *(PyObject **)addr = bytes; return 1; } #endif if (PyBytes_Check(arg)) { Py_INCREF(arg); *(PyObject **)addr = arg; return 1; } set_error(PyExc_TypeError, "Argument type is not bytes: %s", arg); return 0; } /* init_lockspace */ PyDoc_STRVAR(pydoc_init_lockspace, "\ init_lockspace(lockspace, path, offset=0, max_hosts=0, num_hosts=0, \ use_aio=True)\n\ *DEPRECATED* use write_lockspace instead.\n\ Initialize a device to be used as sanlock lockspace."); static PyObject * py_init_lockspace(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv = -1, max_hosts = 0, num_hosts = 0, use_aio = 1; PyObject *lockspace = NULL; PyObject *path = NULL; struct sanlk_lockspace ls = {0}; static char *kwlist[] = {"lockspace", "path", "offset", "max_hosts", "num_hosts", "use_aio", NULL}; /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "O&O&|kiii", kwlist, convert_to_pybytes, &lockspace, pypath_converter, &path, &ls.host_id_disk.offset, &max_hosts, &num_hosts, &use_aio)) { goto finally; } /* prepare sanlock names */ strncpy(ls.name, PyBytes_AsString(lockspace), SANLK_NAME_LEN); strncpy(ls.host_id_disk.path, PyBytes_AsString(path), SANLK_PATH_LEN - 1); /* init sanlock lockspace (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_direct_init(&ls, NULL, max_hosts, num_hosts, use_aio); Py_END_ALLOW_THREADS if (rv != 0) { set_sanlock_error(rv, "Sanlock lockspace init failure"); goto finally; } finally: Py_XDECREF(lockspace); Py_XDECREF(path); if (rv != 0) return NULL; Py_RETURN_NONE; } /* init_resource */ PyDoc_STRVAR(pydoc_init_resource, "\ init_resource(lockspace, resource, disks, max_hosts=0, num_hosts=0, \ use_aio=True)\n\ *DEPRECATED* use write_resource instead.\n\ Initialize a device to be used as 
sanlock resource.\n\ The disks must be in the format: [(path, offset), ... ]"); static PyObject * py_init_resource(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv = -1, max_hosts = 0, num_hosts = 0, use_aio = 1; PyObject *lockspace = NULL, *resource = NULL; struct sanlk_resource *res = NULL; PyObject *disks; static char *kwlist[] = {"lockspace", "resource", "disks", "max_hosts", "num_hosts", "use_aio", NULL}; /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "O&O&O!|iii", kwlist, convert_to_pybytes, &lockspace, convert_to_pybytes, &resource, &PyList_Type, &disks, &max_hosts, &num_hosts, &use_aio)) { goto finally; } /* parse and check sanlock resource */ if (parse_disks(disks, &res) < 0) { goto finally; } /* prepare sanlock names */ strncpy(res->lockspace_name, PyBytes_AsString(lockspace), SANLK_NAME_LEN); strncpy(res->name, PyBytes_AsString(resource), SANLK_NAME_LEN); /* init sanlock resource (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_direct_init(NULL, res, max_hosts, num_hosts, use_aio); Py_END_ALLOW_THREADS if (rv != 0) { set_sanlock_error(rv, "Sanlock resource init failure"); goto finally; } finally: Py_XDECREF(lockspace); Py_XDECREF(resource); free(res); if (rv != 0) return NULL; Py_RETURN_NONE; } /* write_lockspace */ PyDoc_STRVAR(pydoc_write_lockspace, "\ write_lockspace(lockspace, path, offset=0, max_hosts=0, iotimeout=0, \ align=1048576, sector=512)\n\ Initialize or update a device to be used as sanlock lockspace.\n\ Align can be one of (1048576, 2097152, 4194304, 8388608).\n\ Sector can be one of (512, 4096)."); static PyObject * py_write_lockspace(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv = -1, max_hosts = 0, sector = SECTOR_SIZE_512; long align = ALIGNMENT_1M; uint32_t io_timeout = 0; PyObject *lockspace = NULL; PyObject *path = NULL; struct sanlk_lockspace ls = {0}; static char *kwlist[] = {"lockspace", "path", "offset", "max_hosts", "iotimeout", "align", "sector", NULL}; /* 
parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "O&O&|kiIli", kwlist, convert_to_pybytes, &lockspace, pypath_converter, &path, &ls.host_id_disk.offset, &max_hosts, &io_timeout, &align, §or)) { goto finally; } /* prepare sanlock names */ strncpy(ls.name, PyBytes_AsString(lockspace), SANLK_NAME_LEN); strncpy(ls.host_id_disk.path, PyBytes_AsString(path), SANLK_PATH_LEN - 1); /* set alignment/sector flags */ if (add_align_flag(align, &ls.flags) == -1) goto finally; if (add_sector_flag(sector, &ls.flags) == -1) goto finally; /* write sanlock lockspace (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_write_lockspace(&ls, max_hosts, 0, io_timeout); Py_END_ALLOW_THREADS if (rv != 0) { set_sanlock_error(rv, "Sanlock lockspace write failure"); goto finally; } finally: Py_XDECREF(lockspace); Py_XDECREF(path); if (rv != 0) return NULL; Py_RETURN_NONE; } /* read_lockspace */ PyDoc_STRVAR(pydoc_read_lockspace, "\ read_lockspace(path, offset=0, align=1048576, sector=512)\n -> dict\n\ Read the lockspace information from a device at a specific offset.\n\ Align can be one of (1048576, 2097152, 4194304, 8388608).\n\ Sector can be one of (512, 4096)."); static PyObject * py_read_lockspace(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv = -1, sector = SECTOR_SIZE_512; long align = ALIGNMENT_1M; uint32_t io_timeout = 0; PyObject *path = NULL; struct sanlk_lockspace ls = {0}; PyObject *ls_info = NULL; static char *kwlist[] = {"path", "offset", "align", "sector", NULL}; /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "O&|kli", kwlist, pypath_converter, &path, &ls.host_id_disk.offset, &align, §or)) { goto finally; } /* prepare sanlock names */ strncpy(ls.host_id_disk.path, PyBytes_AsString(path), SANLK_PATH_LEN - 1); /* set alignment/sector flags */ if (add_align_flag(align, &ls.flags) == -1) goto finally; if (add_sector_flag(sector, &ls.flags) == -1) goto finally; /* read sanlock lockspace (gil disabled) */ 
Py_BEGIN_ALLOW_THREADS rv = sanlock_read_lockspace(&ls, 0, &io_timeout); Py_END_ALLOW_THREADS if (rv != 0) { set_sanlock_error(rv, "Sanlock lockspace read failure"); goto finally; } /* fill the information dictionary */ ls_info = Py_BuildValue( #if PY_MAJOR_VERSION == 2 "{s:s,s:I}", #else "{s:y,s:I}", #endif "lockspace", ls.name, "iotimeout", io_timeout); if (ls_info == NULL) goto finally; finally: Py_XDECREF(path); if (rv != 0) return NULL; return ls_info; } /* read_resource */ PyDoc_STRVAR(pydoc_read_resource, "\ read_resource(path, offset=0, align=1048576, sector=512) -> dict\n\ Read the resource information from a device at a specific offset.\n\ Align can be one of (1048576, 2097152, 4194304, 8388608).\n\ Sector can be one of (512, 4096)."); static PyObject * py_read_resource(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv = -1, sector = SECTOR_SIZE_512; long align = ALIGNMENT_1M; PyObject *path = NULL; struct sanlk_resource *res; PyObject *res_info = NULL; static char *kwlist[] = {"path", "offset", "align", "sector", NULL}; res = create_resource(1 /* num_disks */); if (res == NULL) return NULL; /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "O&|kli", kwlist, pypath_converter, &path, &(res->disks[0].offset), &align, §or)) { goto finally; } if (!validate_path(path)) goto finally; /* prepare the resource disk path */ strncpy(res->disks[0].path, PyBytes_AsString(path), SANLK_PATH_LEN - 1); /* set alignment/sector flags */ if (add_align_flag(align, &res->flags) == -1) goto finally; if (add_sector_flag(sector, &res->flags) == -1) goto finally; /* read sanlock resource (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_read_resource(res, 0); Py_END_ALLOW_THREADS if (rv != 0) { set_sanlock_error(rv, "Sanlock resource read failure"); goto finally; } /* prepare the dictionary holding the information */ res_info = Py_BuildValue( #if PY_MAJOR_VERSION == 2 "{s:s,s:s,s:K}", #else "{s:y,s:y,s:K}", #endif "lockspace", 
res->lockspace_name, "resource", res->name, "version", res->lver); if (res_info == NULL) goto finally; finally: free(res); Py_XDECREF(path); if (rv != 0) { Py_XDECREF(res_info); return NULL; } return res_info; } /* write_resource */ PyDoc_STRVAR(pydoc_write_resource, "\ write_resource(lockspace, resource, disks, max_hosts=0, num_hosts=0, \ clear=False, align=1048576, sector=512)\n\ Initialize a device to be used as sanlock resource.\n\ The disks must be in the format: [(path, offset), ... ].\n\ If clear is True, the resource is cleared so subsequent read will\n\ return an error.\n\ Align can be one of (1048576, 2097152, 4194304, 8388608).\n\ Sector can be one of (512, 4096)."); static PyObject * py_write_resource(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv = -1, max_hosts = 0, num_hosts = 0, clear = 0, sector = SECTOR_SIZE_512; long align = ALIGNMENT_1M; PyObject *lockspace = NULL, *resource = NULL; struct sanlk_resource *res = NULL; PyObject *disks; uint32_t flags = 0; static char *kwlist[] = {"lockspace", "resource", "disks", "max_hosts", "num_hosts", "clear", "align", "sector", NULL}; /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "O&O&O!|iiili", kwlist, convert_to_pybytes, &lockspace, convert_to_pybytes, &resource, &PyList_Type, &disks, &max_hosts, &num_hosts, &clear, &align, §or)) { goto finally; } /* parse and check sanlock resource */ if (parse_disks(disks, &res) < 0) { goto finally; } /* prepare sanlock names */ strncpy(res->lockspace_name, PyBytes_AsString(lockspace), SANLK_NAME_LEN); strncpy(res->name, PyBytes_AsString(resource), SANLK_NAME_LEN); /* set alignment/sector flags */ if (add_align_flag(align, &res->flags) == -1) goto finally; if (add_sector_flag(sector, &res->flags) == -1) goto finally; if (clear) { flags |= SANLK_WRITE_CLEAR; } /* init sanlock resource (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_write_resource(res, max_hosts, num_hosts, flags); Py_END_ALLOW_THREADS if (rv != 0) { 
set_sanlock_error(rv, "Sanlock resource write failure"); goto finally; } finally: Py_XDECREF(lockspace); Py_XDECREF(resource); free(res); if (rv != 0) return NULL; Py_RETURN_NONE; } /* add_lockspace */ PyDoc_STRVAR(pydoc_add_lockspace, "\ add_lockspace(lockspace, host_id, path, offset=0, iotimeout=0, wait=True)\n\ Add a lockspace, acquiring a host_id in it. If wait is False the function\n\ will return immediatly and the status can be checked using inq_lockspace.\n\ The iotimeout option configures the io timeout for the specific lockspace,\n\ overriding the default value (see the sanlock daemon parameter -o)."); static PyObject * py_add_lockspace(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv = -1, flags = 0; int wait = 1; uint32_t iotimeout = 0; PyObject *lockspace = NULL; PyObject *path = NULL; struct sanlk_lockspace ls = {0}; static char *kwlist[] = {"lockspace", "host_id", "path", "offset", "iotimeout", "wait", NULL}; /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "O&kO&|kIi", kwlist, convert_to_pybytes, &lockspace, &ls.host_id, pypath_converter, &path, &ls.host_id_disk.offset, &iotimeout, &wait)) { goto finally; } /* prepare sanlock_add_lockspace flags */ if (!wait) { flags |= SANLK_ADD_ASYNC; } /* prepare sanlock names */ strncpy(ls.name, PyBytes_AsString(lockspace), SANLK_NAME_LEN); strncpy(ls.host_id_disk.path, PyBytes_AsString(path), SANLK_PATH_LEN - 1); /* add sanlock lockspace (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_add_lockspace_timeout(&ls, flags, iotimeout); Py_END_ALLOW_THREADS if (rv != 0) { set_sanlock_error(rv, "Sanlock lockspace add failure"); goto finally; } finally: Py_XDECREF(lockspace); Py_XDECREF(path); if (rv != 0 ) return NULL; Py_RETURN_NONE; } /* inq_lockspace */ PyDoc_STRVAR(pydoc_inq_lockspace, "\ inq_lockspace(lockspace, host_id, path, offset=0, wait=False)\n\ Return True if the sanlock daemon currently owns the host_id in lockspace,\n\ False otherwise. 
The special value None is returned when the daemon is\n\ still in the process of acquiring or releasing the host_id. If the wait\n\ flag is set to True the function will block until the host_id is either\n\ acquired or released."); static PyObject * py_inq_lockspace(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv = BIND_ERROR, waitrs = 0, flags = 0; PyObject *lockspace = NULL; PyObject *path = NULL; struct sanlk_lockspace ls = {0}; static char *kwlist[] = {"lockspace", "host_id", "path", "offset", "wait", NULL}; /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "O&kO&|ki", kwlist, convert_to_pybytes, &lockspace, &ls.host_id, pypath_converter, &path, &ls.host_id_disk.offset, &waitrs)) { goto finally; } /* prepare sanlock_inq_lockspace flags */ if (waitrs) { flags |= SANLK_INQ_WAIT; } /* prepare sanlock names */ strncpy(ls.name, PyBytes_AsString(lockspace), SANLK_NAME_LEN); strncpy(ls.host_id_disk.path, PyBytes_AsString(path), SANLK_PATH_LEN - 1); /* add sanlock lockspace (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_inq_lockspace(&ls, flags); Py_END_ALLOW_THREADS finally: Py_XDECREF(lockspace); Py_XDECREF(path); if (rv == BIND_ERROR) { return NULL; } else if (rv == 0) { Py_RETURN_TRUE; } else if (rv == -ENOENT) { Py_RETURN_FALSE; } else if (rv == -EINPROGRESS) { Py_RETURN_NONE; } set_sanlock_error(rv, "Sanlock lockspace inquire failure"); return NULL; } /* rem_lockspace */ PyDoc_STRVAR(pydoc_rem_lockspace, "\ rem_lockspace(lockspace, host_id, path, offset=0, wait=True, unused=False)\n\ Remove a lockspace, releasing the acquired host_id. If wait is False the\n\ function will return immediately and the status can be checked using\n\ inq_lockspace. If unused is True the command will fail (EBUSY) if there is\n\ at least one acquired resource in the lockspace. 
Otherwise (the default)\n\ sanlock will try to terminate processes holding resource leases and upon\n\ successful termination these leases will be released."); static PyObject * py_rem_lockspace(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv = -1, unused = 0, flags = 0; int wait = 1; PyObject *lockspace = NULL; PyObject *path = NULL; struct sanlk_lockspace ls = {0}; static char *kwlist[] = {"lockspace", "host_id", "path", "offset", "wait", "unused", NULL}; /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "O&kO&|kii", kwlist, convert_to_pybytes, &lockspace, &ls.host_id, pypath_converter, &path, &ls.host_id_disk.offset, &wait, &unused)) { goto finally; } /* prepare sanlock names */ strncpy(ls.name, PyBytes_AsString(lockspace), SANLK_NAME_LEN); strncpy(ls.host_id_disk.path, PyBytes_AsString(path), SANLK_PATH_LEN - 1); /* prepare sanlock_rem_lockspace flags */ if (!wait) { flags |= SANLK_REM_ASYNC; } if (unused) { flags |= SANLK_REM_UNUSED; } /* remove sanlock lockspace (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_rem_lockspace(&ls, flags); Py_END_ALLOW_THREADS if (rv != 0) { set_sanlock_error(rv, "Sanlock lockspace remove failure"); goto finally; } finally: Py_XDECREF(lockspace); Py_XDECREF(path); if (rv != 0) return NULL; Py_RETURN_NONE; } static PyObject * lockspaces_to_list(struct sanlk_lockspace *lss, int lss_count) { PyObject *ls_list = PyList_New(lss_count); if (ls_list == NULL) goto exit_fail; for (int i = 0; i < lss_count; i++) { PyObject *ls_entry = Py_BuildValue( #if PY_MAJOR_VERSION == 2 "{s:s,s:K,s:s,s:K,s:I}", #else "{s:y,s:K,s:s,s:K,s:I}", #endif "lockspace", lss[i].name, "host_id", lss[i].host_id, "path", lss[i].host_id_disk.path, "offset", lss[i].host_id_disk.offset, "flags", lss[i].flags); if (ls_entry == NULL) goto exit_fail; /* Steals reference to ls_entry. 
*/ if (PyList_SetItem(ls_list, i, ls_entry) != 0) { Py_DECREF(ls_entry); goto exit_fail; } } return ls_list; exit_fail: Py_XDECREF(ls_list); return NULL; } /* get_lockspaces */ PyDoc_STRVAR(pydoc_get_lockspaces, "\ get_lockspaces() -> list\n\ Return the list of lockspaces currently managed by sanlock. The reported\n\ flag indicates whether the lockspace is acquired (0) or in transition.\n\ The possible transition values are LSFLAG_ADD if the lockspace is in the\n\ process of being acquired, and LSFLAG_REM if it's in the process of being\n\ released.\n"); static PyObject * py_get_lockspaces(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv, lss_count; struct sanlk_lockspace *lss = NULL; PyObject *ls_list = NULL; /* get all the lockspaces (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_get_lockspaces(&lss, &lss_count, 0); Py_END_ALLOW_THREADS if (rv < 0) { set_sanlock_error(rv, "Sanlock get lockspaces failure"); goto finally; } ls_list = lockspaces_to_list(lss, lss_count); finally: free(lss); return ls_list; } /* get_hosts */ PyDoc_STRVAR(pydoc_get_hosts, "\ get_hosts(lockspace, host_id=0) -> list\n\ Return the list of hosts currently alive in a lockspace. When the host_id\n\ is specified then only the requested host status is returned. The reported\n\ flag indicates whether the host is free (HOST_FREE), alive (HOST_LIVE),\n\ failing (HOST_FAIL), dead (HOST_DEAD) or unknown (HOST_UNKNOWN).\n\ The unknown state is the default when sanlock just joined the lockspace\n\ and didn't collect enough information to determine the real status of other\n\ hosts. 
The dictionary returned also contains: the generation, the last\n\ timestamp and the io_timeout.\n"); static PyObject * py_get_hosts(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv = -1, hss_count = 0; uint64_t host_id = 0; PyObject *lockspace = NULL; struct sanlk_host *hss = NULL; PyObject *ls_list = NULL; static char *kwlist[] = {"lockspace", "host_id", NULL}; /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "O&|k", kwlist, convert_to_pybytes, &lockspace, &host_id)) { goto finally; } /* get all the lockspaces (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_get_hosts(PyBytes_AsString(lockspace), host_id, &hss, &hss_count, 0); Py_END_ALLOW_THREADS if (rv < 0) { set_sanlock_error(rv, "Sanlock get hosts failure"); goto finally; } ls_list = hosts_to_list(hss, hss_count); finally: Py_XDECREF(lockspace); free(hss); if (rv < 0) return NULL; return ls_list; } /* acquire */ PyDoc_STRVAR(pydoc_acquire, "\ acquire(lockspace, resource, disks \ [, slkfd=fd, pid=owner, shared=False, version=None])\n\ Acquire a resource lease for the current process (using the slkfd argument\n\ to specify the sanlock file descriptor) or for an other process (using the\n\ pid argument). If shared is True the resource will be acquired in the shared\n\ mode. The version is the version of the lease that must be acquired or fail.\n\ The disks must be in the format: [(path, offset), ... 
]\n"); static PyObject * py_acquire(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv = -1, sanlockfd = -1, pid = -1, shared = 0; PyObject *lockspace = NULL, *resource = NULL; struct sanlk_resource *res = NULL; PyObject *disks, *version = Py_None; static char *kwlist[] = {"lockspace", "resource", "disks", "slkfd", "pid", "shared", "version", NULL}; /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "O&O&O!|iiiO", kwlist, convert_to_pybytes, &lockspace, convert_to_pybytes, &resource, &PyList_Type, &disks, &sanlockfd, &pid, &shared, &version)) { goto finally; } /* check if any of the slkfd or pid parameters was given */ if (sanlockfd == -1 && pid == -1) { set_sanlock_error(EINVAL, "Invalid slkfd and pid values"); goto finally; } /* parse and check sanlock resource */ if (parse_disks(disks, &res) < 0) { goto finally; } /* prepare sanlock names */ strncpy(res->lockspace_name, PyBytes_AsString(lockspace), SANLK_NAME_LEN); strncpy(res->name, PyBytes_AsString(resource), SANLK_NAME_LEN); /* prepare sanlock flags */ if (shared) { res->flags |= SANLK_RES_SHARED; } /* prepare the resource version */ if (version != Py_None) { res->flags |= SANLK_RES_LVER; res->lver = pyinteger_as_unsigned_long_long_mask(version); if (res->lver == (uint64_t)-1) { set_sanlock_error(EINVAL, "Unable to convert the version value"); goto finally; } } /* acquire sanlock resource (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_acquire(sanlockfd, pid, 0, 1, &res, 0); Py_END_ALLOW_THREADS if (rv != 0) { set_sanlock_error(rv, "Sanlock resource not acquired"); goto finally; } finally: Py_XDECREF(lockspace); Py_XDECREF(resource); free(res); if (rv != 0) return NULL; Py_RETURN_NONE; } /* release */ PyDoc_STRVAR(pydoc_release, "\ release(lockspace, resource, disks [, slkfd=fd, pid=owner])\n\ Release a resource lease for the current process.\n\ The disks must be in the format: [(path, offset), ... 
]"); static PyObject * py_release(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv = -1, sanlockfd = -1, pid = -1; PyObject *lockspace = NULL, *resource = NULL; struct sanlk_resource *res = NULL; PyObject *disks; static char *kwlist[] = {"lockspace", "resource", "disks", "slkfd", "pid", NULL}; /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "O&O&O!|ii", kwlist, convert_to_pybytes, &lockspace, convert_to_pybytes, &resource, &PyList_Type, &disks, &sanlockfd, &pid)) { goto finally; } /* parse and check sanlock resource */ if (parse_disks(disks, &res) < 0) { goto finally; } /* prepare sanlock names */ strncpy(res->lockspace_name, PyBytes_AsString(lockspace), SANLK_NAME_LEN); strncpy(res->name, PyBytes_AsString(resource), SANLK_NAME_LEN); /* release sanlock resource (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_release(sanlockfd, pid, 0, 1, &res); Py_END_ALLOW_THREADS if (rv != 0) { set_sanlock_error(rv, "Sanlock resource not released"); goto finally; } finally: Py_XDECREF(lockspace); Py_XDECREF(resource); free(res); if (rv != 0) return NULL; Py_RETURN_NONE; } /* request */ PyDoc_STRVAR(pydoc_request, "\ request(lockspace, resource, disks [, action=REQ_GRACEFUL, version=None])\n\ Request the owner of a resource to do something specified by action.\n\ The possible values for action are: REQ_GRACEFUL to request a graceful\n\ release of the resource and REQ_FORCE to sigkill the owner of the\n\ resource (forcible release). The version should be either the next version\n\ to acquire or None (which automatically uses the next version).\n\ The disks must be in the format: [(path, offset), ... 
]"); static PyObject * py_request(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv = -1, action = SANLK_REQ_GRACEFUL, flags = 0; PyObject *lockspace = NULL, *resource = NULL; struct sanlk_resource *res = NULL; PyObject *disks, *version = Py_None; static char *kwlist[] = {"lockspace", "resource", "disks", "action", "version", NULL}; /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "O&O&O!|iO", kwlist, convert_to_pybytes, &lockspace, convert_to_pybytes, &resource, &PyList_Type, &disks, &action, &version)) { goto finally; } /* parse and check sanlock resource */ if (parse_disks(disks, &res) < 0) { goto finally; } /* prepare sanlock names */ strncpy(res->lockspace_name, PyBytes_AsString(lockspace), SANLK_NAME_LEN); strncpy(res->name, PyBytes_AsString(resource), SANLK_NAME_LEN); /* prepare the resource version */ if (version == Py_None) { flags = SANLK_REQUEST_NEXT_LVER; } else { res->flags |= SANLK_RES_LVER; res->lver = pyinteger_as_unsigned_long_long_mask(version); if (res->lver == (uint64_t)-1) { set_sanlock_error(EINVAL, "Unable to convert the version value"); goto finally; } } /* request sanlock resource (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_request(flags, action, res); Py_END_ALLOW_THREADS if (rv != 0) { set_sanlock_error(rv, "Sanlock request not submitted"); goto finally; } finally: Py_XDECREF(lockspace); Py_XDECREF(resource); free(res); if (rv != 0) return NULL; Py_RETURN_NONE; } /* read_resource_owners */ PyDoc_STRVAR(pydoc_read_resource_owners, "\ read_resource_owners(lockspace, resource, disks, align=1048576, sector=512) \ -> list\n\ Returns the list of hosts owning a resource, the list is not filtered and\n\ it might contain hosts that are currently failing or dead. The hosts are\n\ returned in the same format used by get_hosts.\n\ The disks must be in the format: [(path, offset), ... 
].\n\ Align can be one of (1048576, 2097152, 4194304, 8388608).\n\ Sector can be one of (512, 4096)."); static PyObject * py_read_resource_owners(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv = -1, hss_count = 0; int sector = SECTOR_SIZE_512; long align = ALIGNMENT_1M; PyObject *lockspace = NULL, *resource = NULL; struct sanlk_resource *res = NULL; struct sanlk_host *hss = NULL; PyObject *disks, *ls_list = NULL; static char *kwlist[] = {"lockspace", "resource", "disks", "align", "sector", NULL}; /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "O&O&O!|li", kwlist, convert_to_pybytes, &lockspace, convert_to_pybytes, &resource, &PyList_Type, &disks, &align, §or)) { goto finally; } /* parse and check sanlock resource */ if (parse_disks(disks, &res) < 0) { goto finally; } /* prepare sanlock names */ strncpy(res->lockspace_name, PyBytes_AsString(lockspace), SANLK_NAME_LEN); strncpy(res->name, PyBytes_AsString(resource), SANLK_NAME_LEN); /* set resource alignment and sector flags */ if (add_align_flag(align, &res->flags) == -1) goto finally; if (add_sector_flag(sector, &res->flags) == -1) goto finally; /* read resource owners (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_read_resource_owners(res, 0, &hss, &hss_count); Py_END_ALLOW_THREADS if (rv != 0) { set_sanlock_error(rv, "Unable to read resource owners"); goto finally; } ls_list = hosts_to_list(hss, hss_count); finally: Py_XDECREF(lockspace); Py_XDECREF(resource); free(res); free(hss); if (rv != 0) return NULL; return ls_list; } static int parse_killpath_item(PyObject *item, char *kpargs, size_t *kplen) { int rv = 0; size_t arg_len = 0; PyObject *path = NULL; const char *p = NULL; if (!pypath_converter(item, &path)) { goto finally; } p = PyBytes_AsString(path); if (!p) { goto finally; } /* computing the argument length considering the escape chars */ for (int i = 0; p[i]; i++, arg_len++) { if (p[i] == ' ' || p[i] == '\\') arg_len++; } /* adding 2 for the space 
separator ' ' and the '\0' terminator */ if (*kplen + arg_len + 2 > SANLK_HELPER_ARGS_LEN) { set_sanlock_error(EINVAL, "Killpath arguments are too long"); goto finally; } /* adding the space separator between arguments */ if (*kplen > 0) { kpargs[(*kplen)++] = ' '; } while (*p) { if (*p == ' ' || *p == '\\') { kpargs[(*kplen)++] = '\\'; } kpargs[(*kplen)++] = *p++; } rv = 1; finally: Py_XDECREF(path); return rv; } /* killpath */ PyDoc_STRVAR(pydoc_killpath, "\ killpath(path, args [, slkfd=fd])\n\ Configure the path and arguments of the executable used to fence a\n\ process either by causing the pid to exit (kill) or putting it into\n\ a safe state (resources released).\n\ The arguments must be in the format: [\"arg1\", \"arg2\", ...]"); static PyObject * py_killpath(PyObject *self __unused, PyObject *args, PyObject *keywds) { int rv = -1, num_args, sanlockfd = -1; size_t kplen = 0; char kpargs[SANLK_HELPER_ARGS_LEN] = ""; PyObject *path = NULL; PyObject *argslist; static char *kwlist[] = {"path", "args", "slkfd", NULL}; /* parse python tuple */ if (!PyArg_ParseTupleAndKeywords(args, keywds, "O&O!|i", kwlist, pypath_converter, &path, &PyList_Type, &argslist, &sanlockfd)) { goto finally; } /* checking the path length */ if (PyBytes_Size(path) + 1 > SANLK_HELPER_PATH_LEN) { set_sanlock_error(EINVAL, "Killpath path argument too long"); goto finally; } num_args = PyList_Size(argslist); /* creating the arguments string from a python list */ for (int i = 0; i < num_args; i++) { PyObject *item = PyList_GetItem(argslist, i); if (!parse_killpath_item(item, kpargs, &kplen)) { goto finally; } } /* configure killpath (gil disabled) */ Py_BEGIN_ALLOW_THREADS rv = sanlock_killpath(sanlockfd, 0, PyBytes_AsString(path), kpargs); Py_END_ALLOW_THREADS if (rv != 0) { set_sanlock_error(rv, "Killpath script not configured"); goto finally; } finally: Py_XDECREF(path); if (rv != 0) return NULL; Py_RETURN_NONE; } /* exception_errno */ PyDoc_STRVAR(pydoc_errno, "exception errno"); static 
PyObject * py_exception_errno(PyObject *self, PyBaseExceptionObject *exc_obj) { PyObject *exc_errno; exc_errno = PyTuple_GetItem(exc_obj->args, 0); if (exc_errno == NULL) return NULL; Py_INCREF(exc_errno); return exc_errno; } /* reg_event */ PyDoc_STRVAR(pydoc_reg_event, "\ reg_event(lockspace) -> int\n\ Register an event listener for lockspace and return an open file descriptor\n\ for waiting for lockspace events. When the file descriptor becomes readable,\n\ you can use get_event to get pending events. When you are done, you must\n\ unregister the event listener using end_event."); static PyObject * py_reg_event(PyObject *self __unused, PyObject *args) { PyObject *lockspace = NULL; int fd = -1; if (!PyArg_ParseTuple(args, "O&", convert_to_pybytes, &lockspace)) { goto finally; } Py_BEGIN_ALLOW_THREADS fd = sanlock_reg_event(PyBytes_AsString(lockspace), NULL /* event */, 0 /* flags */); Py_END_ALLOW_THREADS if (fd < 0) { set_sanlock_error(fd, "Unable to register event fd"); goto finally; } finally: Py_XDECREF(lockspace); if (fd < 0) return NULL; return Py_BuildValue("i", fd); } /* get_event */ PyDoc_STRVAR(pydoc_get_event, "\ get_event(fd) -> list\n\ Get list of lockspace events.\n\ \n\ Each event is a dictionary with the following keys:\n\ from_host_id host id of the host setting this event (int)\n\ from_generation host generation of the host setting this event (int)\n\ host_id my host id (int)\n\ generation my generation where the event was set (int)\n\ event event number (int)\n\ data optional event data (int)\n\ "); static PyObject * py_get_event(PyObject *self __unused, PyObject *args) { int fd = -1; struct sanlk_host_event he; uint64_t from_host_id; uint64_t from_generation; PyObject *events = NULL; PyObject *item = NULL; int rv; if (!PyArg_ParseTuple(args, "i", &fd)) return NULL; if ((events = PyList_New(0)) == NULL) goto exit_fail; for (;;) { Py_BEGIN_ALLOW_THREADS rv = sanlock_get_event(fd, 0, &he, &from_host_id, &from_generation); Py_END_ALLOW_THREADS if 
(rv == -EAGAIN) break; if (rv != 0) { set_sanlock_error(rv, "Unable to get events"); goto exit_fail; } item = Py_BuildValue( "{s:K,s:K,s:K,s:K,s:K,s:K}", "from_host_id", from_host_id, "from_generation", from_generation, "host_id", he.host_id, "generation", he.generation, "event", he.event, "data", he.data); if (item == NULL) goto exit_fail; if (PyList_Append(events, item) != 0) goto exit_fail; Py_CLEAR(item); } return events; exit_fail: Py_XDECREF(item); Py_XDECREF(events); return NULL; } /* end_event */ PyDoc_STRVAR(pydoc_end_event, "\ end_event(fd, lockspace)\n\ Unregister an event listener for lockspace registered with reg_event."); static PyObject * py_end_event(PyObject *self __unused, PyObject *args) { int fd = -1; PyObject *lockspace = NULL; int rv = -1; if (!PyArg_ParseTuple(args, "iO&", &fd, convert_to_pybytes, &lockspace)) { goto finally; } Py_BEGIN_ALLOW_THREADS rv = sanlock_end_event(fd, PyBytes_AsString(lockspace), 0 /* flags */); Py_END_ALLOW_THREADS if (rv < 0) { set_sanlock_error(rv, "Unable to unregister event fd"); goto finally; } finally: Py_XDECREF(lockspace); if (rv < 0) return NULL; Py_RETURN_NONE; } /* set_event */ PyDoc_STRVAR(pydoc_set_event, "\ set_event(lockspace, host_id, generation, event, data=0, flags=0)\n\ Set events to hosts on a lockspace.\n\ \n\ Arguments\n\ lockspace lockspace name (str)\n\ host_id recipient host_id (int)\n\ generation recipient generation (int)\n\ event event number (int)\n\ data optional event data (int)\n\ flags optional combination of event flags (int)\n\ \n\ Flags\n\ SETEV_CUR_GENERATION if generation is zero, use current host\n\ generation.\n\ SETEV_CLEAR_HOSTID clear the host_id in the next renewal so host_id\n\ will stop seeing this event. 
If the same event\n\ was sent to other hosts, they will continue to\n\ see the event until the event is cleared.\n\ SETEV_CLEAR_EVENT Clear the event/data/generation values in the\n\ next renewal, ending this event.\n\ SETEV_REPLACE_EVENT Replace the existing event/data values of the\n\ current event. Without this flag, the operation\n\ will raise SanlockException with -EBUSY error.\n\ SETEV_ALL_HOSTS set event for all hosts.\n\ \n\ Examples\n\ \n\ Send event 1 to host 42 on lockspace 'foo', using current host generation:\n\ set_event('foo', 42, 0, 1, flags=SETEV_CUR_GENERATION)\n\ \n\ Send the same event also to host 7 on lockspace 'foo', using current host\n\ generation. Both host 42 and host 7 will see the same event:\n\ set_event('foo', 7, 0, 1, flags=SETEV_CUR_GENERATION)\n\ \n\ Send event 3 to all hosts on lockspace 'foo', replacing previous events\n\ sent to other hosts. Note that you must use a valid host_id, but the\n\ generation is ignored:\n\ set_event('foo', 1, 0, 3, flags=SETEV_ALL_HOSTS|SETEV_REPLACE_EVENT)\n\ \n\ Notes\n\ \n\ Sequential set_events with different event/data values, within a short\n\ time span is likely to produce unwanted results, because the new\n\ event/data values replace the previous values before the previous values\n\ have been read.\n\ \n\ Unless SETEV_REPLACE_EVENT flag is used, sanlock will raise SanlockException\n\ with -EBUSY error in this case.\n\ "); static PyObject * py_set_event(PyObject *self __unused, PyObject *args, PyObject *keywds) { PyObject *lockspace = NULL; struct sanlk_host_event he = {0}; uint32_t flags = 0; int rv = -1; static char *kwlist[] = {"lockspace", "host_id", "generation", "event", "data", "flags", NULL}; if (!PyArg_ParseTupleAndKeywords(args, keywds, "O&KKK|KI", kwlist, convert_to_pybytes, &lockspace, &he.host_id, &he.generation, &he.event, &he.data, &flags)) { goto finally; } Py_BEGIN_ALLOW_THREADS rv = sanlock_set_event(PyBytes_AsString(lockspace), &he, flags); Py_END_ALLOW_THREADS if (rv < 0) { 
set_sanlock_error(rv, "Unable to set event"); goto finally; } finally: Py_XDECREF(lockspace); if (rv < 0) return NULL; Py_RETURN_NONE; } static PyMethodDef sanlock_methods[] = { {"register", py_register, METH_NOARGS, pydoc_register}, {"get_alignment", py_get_alignment, METH_VARARGS, pydoc_get_alignment}, {"init_lockspace", (PyCFunction) py_init_lockspace, METH_VARARGS|METH_KEYWORDS, pydoc_init_lockspace}, {"init_resource", (PyCFunction) py_init_resource, METH_VARARGS|METH_KEYWORDS, pydoc_init_resource}, {"write_lockspace", (PyCFunction) py_write_lockspace, METH_VARARGS|METH_KEYWORDS, pydoc_write_lockspace}, {"write_resource", (PyCFunction) py_write_resource, METH_VARARGS|METH_KEYWORDS, pydoc_write_resource}, {"read_lockspace", (PyCFunction) py_read_lockspace, METH_VARARGS|METH_KEYWORDS, pydoc_read_lockspace}, {"read_resource", (PyCFunction) py_read_resource, METH_VARARGS|METH_KEYWORDS, pydoc_read_resource}, {"add_lockspace", (PyCFunction) py_add_lockspace, METH_VARARGS|METH_KEYWORDS, pydoc_add_lockspace}, {"inq_lockspace", (PyCFunction) py_inq_lockspace, METH_VARARGS|METH_KEYWORDS, pydoc_inq_lockspace}, {"rem_lockspace", (PyCFunction) py_rem_lockspace, METH_VARARGS|METH_KEYWORDS, pydoc_rem_lockspace}, {"get_lockspaces", (PyCFunction) py_get_lockspaces, METH_VARARGS|METH_KEYWORDS, pydoc_get_lockspaces}, {"get_hosts", (PyCFunction) py_get_hosts, METH_VARARGS|METH_KEYWORDS, pydoc_get_hosts}, {"read_resource_owners", (PyCFunction) py_read_resource_owners, METH_VARARGS|METH_KEYWORDS, pydoc_read_resource_owners}, {"acquire", (PyCFunction) py_acquire, METH_VARARGS|METH_KEYWORDS, pydoc_acquire}, {"release", (PyCFunction) py_release, METH_VARARGS|METH_KEYWORDS, pydoc_release}, {"request", (PyCFunction) py_request, METH_VARARGS|METH_KEYWORDS, pydoc_request}, {"killpath", (PyCFunction) py_killpath, METH_VARARGS|METH_KEYWORDS, pydoc_killpath}, {"reg_event", (PyCFunction) py_reg_event, METH_VARARGS, pydoc_reg_event}, {"get_event", (PyCFunction) py_get_event, METH_VARARGS, 
pydoc_get_event}, {"end_event", (PyCFunction) py_end_event, METH_VARARGS, pydoc_end_event}, {"set_event", (PyCFunction) py_set_event, METH_VARARGS|METH_KEYWORDS, pydoc_set_event}, {NULL, NULL, 0, NULL} }; static PyMethodDef sanlock_exception = { "errno", (PyCFunction) py_exception_errno, METH_O, pydoc_errno }; static PyObject * initexception(void) { PyObject *func = PyCFunction_New(&sanlock_exception, NULL); if (func == NULL) return NULL; PyObject *meth = PyObject_CallFunction((PyObject *) &PyProperty_Type, "O", func); Py_CLEAR(func); if (meth == NULL) return NULL; PyObject *dict = Py_BuildValue("{s:O}", sanlock_exception.ml_name, meth); Py_CLEAR(meth); if (dict == NULL) return NULL; PyObject *excp = PyErr_NewException("sanlock.SanlockException", NULL, dict); Py_CLEAR(dict); return excp; } static int module_init(PyObject* m) { if (py_exception == NULL) { py_exception = initexception(); if (py_exception == NULL) return -1; } Py_INCREF(py_exception); if (PyModule_AddObject(m, "SanlockException", py_exception)) { Py_DECREF(py_exception); return -1; } /* lockspaces list flags */ if (PyModule_AddIntConstant(m, "LSFLAG_ADD", SANLK_LSF_ADD)) return -1; if (PyModule_AddIntConstant(m, "LSFLAG_REM", SANLK_LSF_REM)) return -1; /* resource request flags */ if (PyModule_AddIntConstant(m, "REQ_FORCE", SANLK_REQ_FORCE)) return -1; if (PyModule_AddIntConstant(m, "REQ_GRACEFUL", SANLK_REQ_GRACEFUL)) return -1; /* hosts list flags */ if (PyModule_AddIntConstant(m, "HOST_FREE", SANLK_HOST_FREE)) return -1; if (PyModule_AddIntConstant(m, "HOST_LIVE", SANLK_HOST_LIVE)) return -1; if (PyModule_AddIntConstant(m, "HOST_FAIL", SANLK_HOST_FAIL)) return -1; if (PyModule_AddIntConstant(m, "HOST_DEAD", SANLK_HOST_DEAD)) return -1; if (PyModule_AddIntConstant(m, "HOST_UNKNOWN", SANLK_HOST_UNKNOWN)) return -1; /* set event flags */ if (PyModule_AddIntConstant(m, "SETEV_CUR_GENERATION", SANLK_SETEV_CUR_GENERATION)) return -1; if (PyModule_AddIntConstant(m, "SETEV_CLEAR_HOSTID", 
SANLK_SETEV_CLEAR_HOSTID)) return -1; if (PyModule_AddIntConstant(m, "SETEV_CLEAR_EVENT", SANLK_SETEV_CLEAR_EVENT)) return -1; if (PyModule_AddIntConstant(m, "SETEV_REPLACE_EVENT", SANLK_SETEV_REPLACE_EVENT)) return -1; if (PyModule_AddIntConstant(m, "SETEV_ALL_HOSTS", SANLK_SETEV_ALL_HOSTS)) return -1; /* Tuples with supported sector size and alignment values */ PyObject *sector = Py_BuildValue("ii", SECTOR_SIZE_512, SECTOR_SIZE_4K); if (!sector) return -1; if (PyModule_AddObject(m, "SECTOR_SIZE", sector)) { Py_DECREF(sector); return -1; } PyObject *align = Py_BuildValue( "llll", ALIGNMENT_1M, ALIGNMENT_2M, ALIGNMENT_4M, ALIGNMENT_8M); if (!align) return -1; if (PyModule_AddObject(m, "ALIGN_SIZE", align)) { Py_DECREF(align); return -1; } return 0; } #if PY_MAJOR_VERSION >= 3 /* Python 3 module init */ static struct PyModuleDef moduledef = { PyModuleDef_HEAD_INIT, MODULE_NAME, pydoc_sanlock, -1, sanlock_methods, }; PyMODINIT_FUNC PyInit_sanlock(void) { PyObject *m = PyModule_Create(&moduledef); if (m == NULL) return NULL; if (module_init(m)) { Py_DECREF(m); return NULL; } return m; } #else /* Python 2 module init */ PyMODINIT_FUNC initsanlock(void) { PyObject *m = Py_InitModule3( MODULE_NAME, sanlock_methods, pydoc_sanlock); if (m == NULL) return; /* We don't have anything to do if module_init() fails. */ module_init(m); } #endif /* vim: set expandtab shiftwidth=4 tabstop=4 : */ sanlock-3.8.2/python/setup.py000066400000000000000000000014051371427612200162160ustar00rootroot00000000000000# Copyright 2010-2019 Red Hat, Inc. # # This copyrighted material is made available to anyone wishing to use, # modify, copy, or redistribute it subject to the terms and conditions # of the GNU General Public License v.2. 
from distutils.core import setup, Extension sanlocklib = ['sanlock'] sanlock = Extension(name='sanlock', sources=['sanlock.c'], include_dirs=['../src'], library_dirs=['../src'], extra_compile_args=["-std=c99"], libraries=sanlocklib) version = None with open('../VERSION') as f: version = f.readline() setup(name='sanlock-python', version=version, description='Python bindings for the sanlock library', ext_modules=[sanlock]) sanlock-3.8.2/reset/000077500000000000000000000000001371427612200143055ustar00rootroot00000000000000sanlock-3.8.2/reset/Makefile000066400000000000000000000016411371427612200157470ustar00rootroot00000000000000include ../common.mk TARGET1 = sanlk-resetd TARGET2 = sanlk-reset SOURCE1 = sanlk_resetd.c SOURCE2 = sanlk_reset.c VER=$(shell cat ../VERSION) CFLAGS += -DVERSION=\"$(VER)\" -I../src -I../wdmd CFLAGS += -fPIE -DPIE LDFLAGS += -Wl,-z,relro -pie LDADD = -lsanlock -lwdmd all: $(TARGET1) $(TARGET2) $(TARGET1): $(SOURCE1) $(CC) $(CFLAGS) $(LDFLAGS) $(SOURCE1) $(LDADD) -o $@ -L. -L../src -L../wdmd $(TARGET2): $(SOURCE2) $(CC) $(CFLAGS) $(LDFLAGS) $(SOURCE2) $(LDADD) -o $@ -L. -L../src -L../wdmd clean: rm -f *.o *.so *.so.* $(TARGET1) $(TARGET2) INSTALL=$(shell which install) DESTDIR= BINDIR=/usr/sbin LIBDIR=/usr/lib64 HEADIR=/usr/include MANDIR=/usr/share/man .PHONY: install install: all $(INSTALL) -d $(DESTDIR)/$(BINDIR) $(INSTALL) -c -m 755 $(TARGET1) $(TARGET2) $(DESTDIR)/$(BINDIR) $(INSTALL) -m 644 sanlk-reset.8 $(DESTDIR)/$(MANDIR)/man8/ $(INSTALL) -m 644 sanlk-resetd.8 $(DESTDIR)/$(MANDIR)/man8/ sanlock-3.8.2/reset/sanlk-reset.8000066400000000000000000000035311371427612200166300ustar00rootroot00000000000000.TH SANLK-RESET 8 2014-08-14 .SH NAME sanlk-reset \- host reset program .SH SYNOPSIS .B sanlk\-reset [OPTIONS] .I action .IR lockspace_name " ..." .SH DESCRIPTION The sanlk\-reset program sets events in specified sanlock lockspaces. 
Events are defined to cause sanlk\-resetd on another host to: .br - use wdmd/watchdog to reset the host .br - use /proc/sysrq\-trigger to reboot the host Both hosts must be operational and have continued access to a common lockspace for the reset request to succeed. After setting the event, the sanlk\-reset program monitors the host status in the sanlock lockspace until the target host is dead. The sanlk\-reset program can also be run on the same host as sanlk\-resetd to update which lockspaces the local sanlk\-resetd is watching for events. .SH OPTIONS .TP .B \-\-version, \-V Print version. .TP .B \-\-help, \-h Print usage. .SS Reset another host The event is set in each lockspace specified. The target host may have a different host id in each lockspace. .B sanlk\-reset reset .IR lockspace_name:host_id " ..." .TP .BI "\-\-host\-id, \-i " num Host id to reset. (Use only with single lockspace name.) .TP .BI "\-\-generation, \-g " num Generation of host. (Use only with single lockspace name.) .TP .B \-\-sysrq\-reboot, \-b 0|1 Enable/Disable (1/0) use of /proc/sysrq\-trigger to reboot. .TP .B \-\-resource\-mode, \-R 0|1 Resource leases are used (1) or not used (0) to protect storage. .TP .B \-\-native\-timeout, \-t " num Disable native timeout by setting to 0. .SS Update the local sanlk\-resetd \& to watch new lockspaces for reset events: .B sanlk\-reset reg .IR lockspace_name " ..." to not watch lockspaces for reset events: .B sanlk\-reset end .IR lockspace_name " ..." to clear all lockspaces being watched: .B sanlk\-reset clear all .SH SEE ALSO .BR sanlk\-resetd (8) .BR sanlock (8) .BR wdmd (8) sanlock-3.8.2/reset/sanlk-resetd.8000066400000000000000000000026411371427612200167750ustar00rootroot00000000000000.TH SANLK\-RESETD 8 2014-08-14 .SH NAME sanlk\-resetd \- host reset daemon .SH SYNOPSIS .B sanlk\-resetd [OPTIONS] .IR lockspace_name " ..." .SH DESCRIPTION The sanlk\-resetd daemon gets events from specified sanlock lockspaces. 
Events are defined to cause sanlk\-resetd to: .br - use wdmd/watchdog to reset the host .br - use /proc/sysrq\-trigger to reboot the host The sanlk\-reset program can be run on another host to request that sanlk\-resetd reset the host it is running on. Both hosts must be operational and have continued access to a common lockspace for the reset request to succeed. After setting the event, the sanlk\-reset program monitors the host status in the sanlock lockspace until the target host is dead. The sanlk\-reset program can also be run on the same host as sanlk\-resetd to update which lockspaces the local sanlk\-resetd is watching for events. .SH OPTIONS .TP .B \-\-version, \-V Print version. .TP .B \-\-help, \-h Print usage. .TP .B \-\-foreground, \-f Don't fork. .TP .B \-\-daemon\-debug, \-D Enable debugging to stderr and don't fork. .TP .B \-\-sysrq\-reboot, \-b 0|1 Enable/Disable (1/0) use of /proc/sysrq\-trigger to reboot. .TP .BI "\-\-sysrq\-delay, \-d " sec Delay this many seconds before using /proc/sysrq\-trigger. .TP .B \-\-resource-mode, \-R 0|1 Resource leases are used (1) or not used (0) to protect storage. .SH SEE ALSO .BR sanlk\-reset (8) .BR sanlock (8) .BR wdmd (8) sanlock-3.8.2/reset/sanlk_reset.c000066400000000000000000000623311371427612200167700ustar00rootroot00000000000000/* * Copyright 2014 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock.h" #include "sanlock_admin.h" #include "sanlock_resource.h" #include "sanlock_direct.h" static struct sockaddr_un update_addr; static socklen_t update_addrlen; #include "sanlk_reset.h" #define EXIT_USAGE 2 #define MAX_LS 64 /* * native timeout: calculate directly when a host's watchdog * should have fired, based on sanlock/wdmd/watchdog timings. * This can complete much quicker than waiting for the sanlock * host status states. The host status states are based on a * lockspace failing to renew a lease, and determining the * latest possible watchdog firing based on that. The reset * case is based on lockspace storage remaining accessable, * and a host acknowledging through a RESETTING event that * its watchdog has been set to expire. Until the watchdog * fires, the host will continue renewing its lease. We can * derive a faster completion based on the RESETTING event * rather than waiting for the sanlock host status, which * would still be correct, but longer, because it's based on * the obvious lack of renewals following the reset. So, * with native timeout, the completion timing begins from the * RESETTING acknowledgment, but with the sanlock host status, * the completion timing begins from the final renewal before * the reset. * * Native timeout calculation: * * This timeout calculation begins when we see the RESETTING * event from the host. It can take multiple lease renewal * intervals for the RESET and RESETTING events to be transmitted * between hosts. The time required for these events precedes the * timeout considered in the following. The total time required * for the program would be the sum of the time required for * the RESET/RESETTING events, and the native timeout. 
* * When we first see the host is resetting, make a record * of the local time (ls_resetting_begin_local) and the remote * timestamp from the delta lease renewal (ls_resetting_begin_timestamp). * * We'll continue watching the resetting host for another * local 90 seconds from the local time saved here. After * that 90 seconds, we'll check the last timestamp seen from * the host. If the last timestamp is more than 70 seconds later * than the first timestamp we saved upon seeing resetting, * then the host's watchdog device failed to fire when required * (watchdog_failed_to_fire). * * The resetting host should be reset (its watchdog should fire) * at most 70 seconds after it sets RESETTING. So, it should not * be possible for a renewal timestamp to be more than 70 seconds * later than the timestamp when it set the RESETTING event. * * T0: host sets RESETTING * T0: host sets up expired wdmd connection * T10: wdmd test interval wakes and sees new expired connection * T10: wdmd closes /dev/watchdog uncleanly, which issues final ping * T70: 60 seconds later, the watchdog device fires * * The host continues renewing its delta lease up until its watchdog * fires at T70, so there may be a renewal right at T70, 70 seconds * after it set RESETTING. * * We continue watching for a renewal for 20 seconds after this * latest possible renewal, to give the host time for another * renewal if its watchdog failed to fire. If we do not see another * renewal for 20 seconds (the max standard renewal interval), then * this is a confirmation that the watchdog fired as expected by T70. * * If the host's watchdog fails to fire, and storage access is maintained, * then the resetting host will continue to renew its lease. This program * will then see renewal timestamps later than T70, and will fail. * * If the host's watchdog fails to reset it, and the host also loses its * storage access, then this program will incorrectly conclude that the * host has been reset when it is not. 
* * The times (90, 70) are based on the following sanlock/wdmd defaults: * . 10 second io_timeout * . 60 second watchdog_fire_timeout * . 20 second id_renewal_seconds * . 10 second wdmd test interval * * If the resetting host has a different io_timeout, then disable * the native timeout check and depend on the host status check. */ #define NATIVE_TIMEOUT_SECONDS 90 #define NATIVE_RENEWAL_SECONDS 70 #define NATIVE_VERIFY_SECONDS (NATIVE_TIMEOUT_SECONDS - NATIVE_RENEWAL_SECONDS) static char *prog_name; static uint64_t begin; static struct pollfd *pollfd; static int use_watchdog = 1; static int use_sysrq_reboot = 0; static int resource_mode; static int debug_mode; static int target_host_id; static uint64_t target_generation; static int native_timeout = NATIVE_TIMEOUT_SECONDS; static int native_renewal = NATIVE_RENEWAL_SECONDS; static int watchdog_failed_to_fire; static int ls_count; static char *ls_names[MAX_LS]; static int ls_hostids[MAX_LS]; static int ls_fd[MAX_LS]; static uint64_t ls_resetting_begin_timestamp[MAX_LS]; static uint64_t ls_resetting_begin_local[MAX_LS]; static uint64_t ls_timestamp[MAX_LS]; static uint32_t ls_host_flags[MAX_LS]; static int ls_is_resetting[MAX_LS]; static int ls_is_dead[MAX_LS]; static int ls_is_free[MAX_LS]; static int ls_renewals[MAX_LS]; #define errlog(fmt, args...) \ do { \ fprintf(stderr, "%llu " fmt "\n", (unsigned long long)time(NULL), ##args); \ } while (0) #define log_debug(fmt, args...) \ do { \ if (debug_mode) \ errlog(fmt, ##args); \ } while (0) #define log_info(fmt, args...) \ errlog(fmt, ##args) #define log_warn(fmt, args...) \ do { \ errlog(fmt, ##args); \ syslog(LOG_WARNING, fmt, ##args); \ } while (0) #define log_error(fmt, args...) 
\ do { \ errlog(fmt, ##args); \ syslog(LOG_ERR, fmt, ##args); \ } while (0) static uint64_t monotime(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return ts.tv_sec; } static void unregister_ls(int i) { sanlock_end_event(ls_fd[i], ls_names[i], 0); ls_names[i] = NULL; ls_fd[i] = -1; pollfd[i].fd = -1; pollfd[i].events = 0; ls_count--; } static const char *host_flag_str(uint32_t flags) { switch(flags) { case SANLK_HOST_UNKNOWN: return "unknown"; case SANLK_HOST_FREE: return "free"; case SANLK_HOST_LIVE: return "live"; case SANLK_HOST_FAIL: return "fail"; case SANLK_HOST_DEAD: return "dead"; default: return "invalid"; }; } /* * When should we give up waiting for a host to be dead/done * and exit with a failure? * * If we have not seen a resetting reply from the host * and it has renewed its lease a number of times [+], * then it probably did not get the reset event or was * not able to perform it. * If this is true for all lockspaces, then give up. * * If we have not seen a resetting reply from the host * and it is now DEAD in the lockspace, then it probably * was reset/rebooted before its reply was written, or * it lost access to storage. * If this is true for all lockspaces, then give up. * * (When resource_mode is 1 we can wait until host is DEAD * and not require a resetting reply, so in that case we * might want an option to skip the early failure when * two renewals are received with no reply... * or just remove resource_mode 1?) * * [+] The number of lease renewals that can be * seen between set RESET and get RESETTING: * * host2 sanlock renews its lease * . writes TS1 * * host1 sanlock renews its lease * . reads TS1 * * host1 sanlk-reset set_event RESET * * host1 sanlk-reset get_hosts * . ls_timestamp = TS1 * * host2 sanlock renews its lease * . writes TS2 * * host1 sanlock renews its lease * . reads TS2 * . writes RESET * * host1 sanlk-reset get_hosts * . ls_timestamp = TS2 * . ls_renewals = 1 * * host2 sanlock renews its lease * . 
writes TS3 * . reads RESET * * host1 sanlock renews its lease * . reads TS3 * * host1 sanlk-reset get_hosts * . ls_timestamp = TS3 * . ls_renewals = 2 * * host2 sanlk-resetd get_event RESET * . set watchdog to fire * * host2 sanlk-resetd set_event RESETTING * * host2 sanlock renews its lease * . writes TS4 * . writes RESETTING * * host1 sanlock renews its lease * . reads TS4 * . reads RESETTING * * host1 sanlk-reset get_hosts * . ls_timestamp = TS4 * . ls_renewals = 3 * * host1 sanlk-reset get_event RESETTING * * So 4 timestamps (3 timestamp changes as counted in ls_renewals) * is the typical number seen by sanlk-reset. * The code below uses 4 renewals in case there is some unknown timing * skew, io delays, scheduling delays, that affect the count. */ static int reset_fail(void) { int cmd_fail = 0; int cmd_wait = 0; int i; if (watchdog_failed_to_fire) return 1; for (i = 0; i < MAX_LS; i++) { if (!ls_names[i]) continue; if (ls_is_resetting[i]) { /* * sanlk-resetd on the host has replied that * it has set up its watchdog to reset it, so * in time it should become DEAD and be counted * as done in reset_done(). The time for the host * to be reported as DEAD is not something we can * compute exactly here, (and it depends on things * like io timeout). * * If the watchdog failed to reset the host (or * use_watchdog was turned off in sanlk-reset), * then we'll continue coming through here * indefinately. We want to eventually fail * in this case, so put a large upper bound * on the time we'll wait for the host state * to become DEAD. */ if (monotime() - begin > 300) { log_error("host watchdog reset failed in %s:%d", ls_names[i], ls_hostids[i]); cmd_fail++; } else { cmd_wait++; } continue; } /* * We have not seen a resetting reply from the host: * 1. we haven't waited long enough yet * 2. the host couldn't perform the reset and won't reply * 3. the host reset/rebooted too quickly before its reply could be written * 4. the host lost storage and didn't get the event * 5. 
the host lost storage and couldn't write a reply * 6. the host is not running sanlk-resetd * 7. the host left the lockspace * 8. the host had not joined any of the lockspaces where the event was set * 9. sanlk-resetd is not watching events in the ls where the event was set */ if (ls_is_dead[i]) { /* case 3, case 4, case 5 */ log_error("host is dead with no reply in %s:%d", ls_names[i], ls_hostids[i]); cmd_fail++; } else if (ls_is_free[i]) { /* case 7, case 8 */ log_error("host is free with no reply in %s:%d", ls_names[i], ls_hostids[i]); cmd_fail++; } else if (ls_renewals[i] >= 4) { /* case 2, case 6, case 9 */ log_error("host renewals %d with no reply in %s:%d", ls_renewals[i], ls_names[i], ls_hostids[i]); cmd_fail++; } else { /* case 1 */ cmd_wait++; } } if (cmd_fail && !cmd_wait) { log_error("reset failed: no replies in %d ls", cmd_fail); return 1; } return 0; } static int reset_done(void) { struct sanlk_host *hs; uint64_t now; uint64_t host_id; uint32_t state; int hs_count; int ls_is_done; int is_done = 0; int i, rv; /* * Get the state of the host in each lockspace. 
*/ for (i = 0; i < MAX_LS; i++) { if (!ls_names[i]) continue; hs_count = 0; hs = NULL; host_id = (uint64_t)ls_hostids[i]; rv = sanlock_get_hosts(ls_names[i], host_id, &hs, &hs_count, 0); if ((rv < 0) || (hs == NULL) || (hs_count != 1) || (hs->host_id != host_id)) { log_error("sanlock_get_hosts error %d ls %s:%d", rv, ls_names[i], ls_hostids[i]); if (hs) free(hs); continue; } if (ls_timestamp[i] && (ls_timestamp[i] != hs->timestamp)) ls_renewals[i]++; ls_timestamp[i] = hs->timestamp; ls_host_flags[i] = hs->flags; log_debug("%04u state %s reply %d timestamp %llu ls %s:%d", (uint32_t)(monotime() - begin), host_flag_str(ls_host_flags[i]), ls_is_resetting[i], (unsigned long long)hs->timestamp, ls_names[i], ls_hostids[i]); if (hs->timestamp && (hs->io_timeout != 10) && native_timeout) { log_error("disable native_timeout due to zero io_timeout in %s:%d", ls_names[i], ls_hostids[i]); native_timeout = 0; } free(hs); } /* * The native timeout check. */ if (!native_timeout) goto check_host_status; for (i = 0; i < MAX_LS; i++) { if (!ls_names[i]) continue; if (!ls_is_resetting[i]) continue; now = monotime(); if (!ls_resetting_begin_local[i]) { ls_resetting_begin_timestamp[i] = ls_timestamp[i]; ls_resetting_begin_local[i] = now; log_debug("resetting begin local %llu timestamp %llu in ls %s:%d", (unsigned long long)ls_resetting_begin_local[i], (unsigned long long)ls_resetting_begin_timestamp[i], ls_names[i], ls_hostids[i]); } if (now - ls_resetting_begin_local[i] > native_timeout) { if (ls_timestamp[i] - ls_resetting_begin_timestamp[i] > native_renewal) { /* * This should never happen. 
*/ log_error("watchdog failed to fire in ls %s:%d", ls_names[i], ls_hostids[i]); log_error("resetting_begin_local %llu now %llu " "resetting_begin_timestamp %llu timestamp %llu " "native_timeout %d native_renewal %d " "ls %s:%d", (unsigned long long)ls_resetting_begin_local[i], (unsigned long long)now, (unsigned long long)ls_resetting_begin_timestamp[i], (unsigned long long)ls_timestamp[i], native_timeout, native_renewal, ls_names[i], ls_hostids[i]); watchdog_failed_to_fire = 1; } else { log_info("reset done by native_timeout in ls %s:%d", ls_names[i], ls_hostids[i]); is_done = 1; } } else { log_debug("native timeout seconds remaining %d in ls %s:%d", native_timeout - (int)(now - ls_resetting_begin_local[i]), ls_names[i], ls_hostids[i]); } } if (watchdog_failed_to_fire) return 0; check_host_status: /* * The host status check. * * The lockspace behavior is different when resource leases * are not used to protect storage, so the conditions to check * depend on the --with-resources option. * * With resource leases, a host is safe/done when either it * is DEAD in any one lockspace (its watchdog has fired). * * Without resource leases, the loss of lockspace storage will * cause the lockspace to cleanly exit immediately. Because of * this, the DEAD state of the delta lease alone is not helpful. * * However, if we get a RESETTING reply, it means sanlk-resetd * on the destination has prevented the lockspace from exiting * due to lost storage. This means that the DEAD state of the * host will imply that the host's watchdog fired. * * with resource leases: if the host is DEAD in any lockspace, * reset is done because the watchdog fired. * * without resource leases: if the host replied and is DEAD in * any lockspace, reset is done because the watchdog fired. 
*/ for (i = 0; i < MAX_LS; i++) { if (!ls_names[i]) continue; ls_is_done = 0; state = ls_host_flags[i] & SANLK_HOST_MASK; if (state == SANLK_HOST_DEAD && !ls_is_dead[i]) { ls_is_dead[i] = 1; log_info("host dead in ls %s:%d", ls_names[i], ls_hostids[i]); } if (state == SANLK_HOST_FREE && !ls_is_free[i]) { ls_is_free[i] = 1; log_info("host free in ls %s:%d", ls_names[i], ls_hostids[i]); } if (resource_mode && ls_is_dead[i]) { ls_is_done = 1; is_done = 1; } if (!resource_mode && ls_is_dead[i] && ls_is_resetting[i]) { ls_is_done = 1; is_done = 1; } if (ls_is_done) log_info("reset done by host_status in ls %s:%d", ls_names[i], ls_hostids[i]); } return is_done; } static void get_events(int i) { struct sanlk_host_event from_he; uint64_t from_host, from_gen; int resetting = 0; int rebooting = 0; int rv; while (1) { rv = sanlock_get_event(ls_fd[i], 0, &from_he, &from_host, &from_gen); if (rv == -EAGAIN) break; if (rv < 0) { log_error("unregister fd %d get_event error %d ls %s", ls_fd[i], rv, ls_names[i]); unregister_ls(i); break; } log_debug("got event %llx %llx from host %llu %llu in ls %s:%d", (unsigned long long)from_he.event, (unsigned long long)from_he.data, (unsigned long long)from_host, (unsigned long long)from_gen, ls_names[i], ls_hostids[i]); resetting = from_he.event & EVENT_RESETTING; rebooting = from_he.event & EVENT_REBOOTING; if ((from_host == ls_hostids[i]) && (resetting || rebooting)) { log_info("host %s%sin ls %s:%d", resetting ? "resetting " : "", rebooting ? 
"rebooting " : "", ls_names[i], ls_hostids[i]); if (resetting) ls_is_resetting[i] = 1; } } } static int update_local_daemon(char *cmd) { char buf[UPDATE_SIZE]; int rv, i, s; s = setup_resetd_socket(); if (s < 0) { fprintf(stderr, "Failed to create socket %d\n", s); return EXIT_FAILURE; } for (i = 0; i < ls_count; i++) { memset(buf, 0, sizeof(buf)); snprintf(buf, UPDATE_SIZE, "%s %s", cmd, ls_names[i]); rv = sendto(s, buf, UPDATE_SIZE, 0, (struct sockaddr *)&update_addr, update_addrlen); if (rv < 0) { printf("Failed to update local sanlk-resetd: %s\n", strerror(errno)); close(s); return EXIT_FAILURE; } else { printf("Updated %s %s\n", cmd, ls_names[i]); } } close(s); return EXIT_SUCCESS; } static void usage(void) { printf("%s [options] reg|end|clear|reset lockspaces\n", prog_name); printf(" --help | -h\n"); printf(" Show this help information.\n"); printf(" --version | -V\n"); printf(" Show version.\n"); printf(" --debug-mode | -D\n"); printf(" Log debugging information.\n"); printf("\n"); printf("Update the local sanlk-resetd to watch lockspaces for reset events:\n"); printf("%s reg lockspace_name ...\n", prog_name); printf("\n"); printf("Update the local sanlk-resetd to not watch lockspaces for reset events:\n"); printf("%s end lockspace_name ...\n", prog_name); printf("\n"); printf("Update the local sanlk-resetd to clear all lockspaces being watched:\n"); printf("%s clear all\n", prog_name); printf("\n"); printf("Reset another host through a lockspace it is watching:\n"); printf("%s reset lockspace_name:host_id ...\n", prog_name); printf("\n"); printf(" --host-id | -i \n"); printf(" Host id to reset.\n"); printf("\n"); printf(" --generation | -g \n"); printf(" Generation of host id (default 0 for current generation).\n"); printf("\n"); printf(" --watchdog | -w 0|1\n"); printf(" Disable (0) use of wdmd/watchdog for testing.\n"); printf("\n"); printf(" --sysrq-reboot | -b 0|1\n"); printf(" Enable/Disable (1/0) use of /proc/sysrq-trigger to reboot (default 0).\n"); 
printf("\n"); printf(" --resource-mode | -R 0|1\n"); printf(" Resource leases are used (1) or not used (0) to protect storage.\n"); printf("\n"); printf(" --native-timeout | -t \n"); printf(" Disable native timeout by setting to 0.\n"); #if 0 printf(" Reset completion is calculated natively and is faster than\n"); printf(" waiting for the sanlock host status. Set to 0 to disable.\n"); printf(" (default %d, using a lower value can produce invalid result.)\n", NATIVE_TIMEOUT_SECONDS); #endif printf("\n"); printf(" The event will be set in each lockspace_name (max %d).\n", MAX_LS); printf(" The -i and -g options can only be used with a single lockspace_name arg.\n"); printf("\n"); } int main(int argc, char *argv[]) { char *ls_name, *colon, *cmd; struct sanlk_host_event he; uint32_t flags = 0; int i, fd, rv; int done = 0; int fail = 0; prog_name = argv[0]; begin = monotime(); memset(&he, 0, sizeof(he)); if (argc < 2) { usage(); exit(EXIT_USAGE); } static struct option long_options[] = { {"help", no_argument, 0, 'h' }, {"version", no_argument, 0, 'V' }, {"host-id", required_argument, 0, 'i' }, {"generation", required_argument, 0, 'g' }, {"watchdog", required_argument, 0, 'w' }, {"sysrq-reboot", required_argument, 0, 'b' }, {"resource-mode", required_argument, 0, 'R' }, {"native-timeout", required_argument, 0, 't' }, {"debug-mode", no_argument, 0, 'D' }, {0, 0, 0, 0 } }; while (1) { int c; int option_index = 0; c = getopt_long(argc, argv, "hVi:g:w:b:R:D", long_options, &option_index); if (c == -1) break; switch (c) { case '0': break; case 'h': usage(); exit(EXIT_SUCCESS); case 'V': printf("%s version: " VERSION "\n", prog_name); exit(EXIT_SUCCESS); case 'i': target_host_id = atoi(optarg); break; case 'g': target_generation = strtoull(optarg, NULL, 0); break; case 'w': use_watchdog = atoi(optarg); break; case 'b': use_sysrq_reboot = atoi(optarg); break; case 'R': resource_mode = atoi(optarg); break; case 't': if (!atoi(optarg)) native_timeout = 0; #if 0 if (native_timeout 
> NATIVE_VERIFY_SECONDS) native_renewal = native_timeout - NATIVE_VERIFY_SECONDS; #endif break; case 'D': debug_mode = 1; break; case '?': default: usage(); exit(EXIT_USAGE); } } if (optind >= argc) { fprintf(stderr, "command is required\n"); exit(2); } cmd = argv[optind]; optind++; ls_count = 0; for (i = optind; i < argc; i++) { if (ls_count == MAX_LS) { fprintf(stderr, "too many lockspaces (max %d)\n", MAX_LS); exit(2); } else { ls_names[ls_count] = argv[i]; ls_count++; } } if (!ls_count) { fprintf(stderr, "lockspace_name is required\n"); exit(EXIT_USAGE); } /* * Update local sanlk-resetd. */ if (!strcmp(cmd, "reg") || !strcmp(cmd, "end") || !strcmp(cmd, "clear")) { return update_local_daemon(cmd); } /* * Reset another host. */ if (strcmp(cmd, "reset")) { fprintf(stderr, "unknown command\n"); exit(EXIT_USAGE); } if ((ls_count > 1) && (target_host_id || target_generation)) { fprintf(stderr, "-i and -g options are only allowed with a single lockspace_name\n"); exit(EXIT_USAGE); } for (i = 0; i < ls_count; i++) { ls_name = ls_names[i]; colon = strstr(ls_name, ":"); if (!colon) { ls_hostids[i] = target_host_id; } else { ls_hostids[i] = atoi(colon+1); *colon = '\0'; } if (ls_hostids[i] < 1 || ls_hostids[i] > 2000) { fprintf(stderr, "invalid host_id %d", ls_hostids[i]); exit(EXIT_USAGE); } } openlog(prog_name, LOG_CONS | LOG_PID, LOG_DAEMON); pollfd = malloc(MAX_LS * sizeof(struct pollfd)); if (!pollfd) return -ENOMEM; for (i = 0; i < MAX_LS; i++) { ls_fd[i] = -1; pollfd[i].fd = -1; pollfd[i].events = 0; pollfd[i].revents = 0; } ls_count = 0; for (i = 0; i < MAX_LS; i++) { if (!ls_names[i]) continue; fd = sanlock_reg_event(ls_names[i], NULL, 0); if (fd < 0) { log_error("reg_event error %d ls %s", fd, ls_names[i]); ls_names[i] = NULL; } else { ls_fd[i] = fd; pollfd[i].fd = ls_fd[i]; pollfd[i].events = POLLIN; ls_count++; } } if (!ls_count) { log_error("No lockspaces could be registered."); exit(EXIT_FAILURE); } if (use_watchdog) he.event |= EVENT_RESET; if 
(use_sysrq_reboot) he.event |= EVENT_REBOOT; for (i = 0; i < MAX_LS; i++) { if (!ls_names[i]) continue; if (ls_fd[i] == -1) continue; /* a host can have different host_ids in different lockspaces */ he.host_id = ls_hostids[i]; he.generation = target_generation; flags = target_generation ? SANLK_SETEV_CUR_GENERATION : 0; rv = sanlock_set_event(ls_names[i], &he, flags); if (rv < 0) { log_error("set_event error %d ls %s", rv, ls_names[i]); unregister_ls(i); } else { log_debug("set event %llx %llx for host %llu %llu in ls %s:%d", (unsigned long long)he.event, (unsigned long long)he.data, (unsigned long long)he.host_id, (unsigned long long)he.generation, ls_names[i], ls_hostids[i]); log_info("asked host to %s%sin ls %s:%d", (he.event & EVENT_RESET) ? "reset " : "", (he.event & EVENT_REBOOT) ? "reboot " : "", ls_names[i], ls_hostids[i]); } } if (!ls_count) { log_error("Event could not be set in any lockspace."); exit(EXIT_FAILURE); } while (1) { rv = poll(pollfd, MAX_LS, 2000); if (rv == -1 && errno == EINTR) continue; if (rv < 0) break; done = reset_done(); if (done) break; fail = reset_fail(); if (fail) break; for (i = 0; i < MAX_LS; i++) { if (pollfd[i].fd < 0) continue; if (pollfd[i].revents & POLLIN) get_events(i); if (pollfd[i].revents & (POLLERR | POLLHUP | POLLNVAL)) { log_debug("unregister fd %d poll %x ls %s", ls_fd[i], pollfd[i].revents, ls_names[i]); unregister_ls(i); } } } for (i = 0; i < MAX_LS; i++) { if (!ls_names[i]) continue; if (ls_fd[i] == -1) continue; unregister_ls(i); } if (done) { log_info("reset done in %u seconds", (uint32_t)(monotime() - begin)); exit(EXIT_SUCCESS); } else { log_error("reset failed in %u seconds", (uint32_t)(monotime() - begin)); exit(EXIT_FAILURE); } } sanlock-3.8.2/reset/sanlk_reset.h000066400000000000000000000020401371427612200167640ustar00rootroot00000000000000/* * Copyright 2014 Red Hat, Inc. 
* * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __SANLK_RESET_H__ #define __SANLK_RESET_H__ #define EVENT_RESET 1 #define EVENT_RESETTING 2 #define EVENT_REBOOT 4 #define EVENT_REBOOTING 8 #define SANLK_RESETD_RUNDIR "/run/sanlk-resetd" #define SANLK_RESETD_SOCKET SANLK_RESETD_RUNDIR "/sanlk-resetd.sock" #define SANLK_RESETD_SOCKET_MODE (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP) #define UPDATE_SIZE 256 /* sendmsg size on unix socket */ static inline int setup_resetd_socket(void) { int s; s = socket(AF_LOCAL, SOCK_DGRAM, 0); if (s < 0) return s; memset(&update_addr, 0, sizeof(update_addr)); update_addr.sun_family = AF_LOCAL; strcpy(update_addr.sun_path, SANLK_RESETD_SOCKET); update_addrlen = sizeof(sa_family_t) + strlen(update_addr.sun_path) + 1; return s; } #endif sanlock-3.8.2/reset/sanlk_resetd.c000066400000000000000000000410761371427612200171370ustar00rootroot00000000000000/* * Copyright 2014 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock.h" #include "sanlock_admin.h" #include "sanlock_resource.h" #include "sanlock_direct.h" #include "wdmd.h" static struct sockaddr_un update_addr; static socklen_t update_addrlen; #include "sanlk_reset.h" #define DEFAULT_SYSRQ_DELAY 25 static char *daemon_name = (char *)"sanlk-resetd"; static int daemon_quit; static int daemon_foreground; static int daemon_debug; static int poll_timeout; static int resource_mode; static int use_watchdog = 1; static int use_sysrq_reboot = 0; static int sysrq_delay = DEFAULT_SYSRQ_DELAY; static int we_are_resetting; static int we_are_rebooting; static int wd_reset_failed; static uint64_t rebooting_time; #define MAX_LS 64 #define POLLFD_COUNT (MAX_LS+2) #define SIGNAL_INDEX (MAX_LS) #define UPDATE_INDEX (MAX_LS+1) static char *ls_names[MAX_LS]; static int ls_fd[MAX_LS]; static int ls_count; static struct pollfd *pollfd; static int update_fd; static int signal_fd; static int wdmd_fd; #define log_debug(fmt, args...) \ do { \ if (daemon_debug) \ fprintf(stderr, "%llu " fmt "\n", (unsigned long long)time(NULL), ##args); \ } while (0) #define log_error(fmt, args...) \ do { \ log_debug(fmt, ##args); \ syslog(LOG_ERR, fmt, ##args); \ } while (0) #define log_warn(fmt, args...) \ do { \ log_debug(fmt, ##args); \ syslog(LOG_WARNING, fmt, ##args); \ } while (0) #define log_notice(fmt, args...) \ do { \ log_debug(fmt, ##args); \ syslog(LOG_NOTICE, fmt, ##args); \ } while (0) static uint64_t monotime(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return ts.tv_sec; } /* * By default a 25 second delay is used before using sysrq to give sanlock * time to write our resetting event in its next lease renewal. 
* * It would not be surprising for sysrq reboot to fail or hang, so it's * important for the watchdog to also be there to reset us. This * sysrq reboot is used only as a way to speed up the reset since the * watchdog requires 60 seconds to fire. */ static void sysrq_reboot(void) { int fd, rv; log_notice("Rebooting host with sysrq"); /* give at least a small chance for the log message to be written */ sleep(1); fd = open("/proc/sysrq-trigger", O_WRONLY); if (fd < 0) { log_error("failed to open sysrq-trigger %d %d", fd, errno); return; } rv = write(fd, "b", 1); if (rv < 0) { log_error("failed to write sysrq-trigger %d %d", rv, errno); } close(fd); /* If sysrq reboot worked, then I don't think we will get here. */ /* If sysrq reboot failed, then the watchdog should reset us. */ log_error("Reboot from sysrq is expected"); } /* * Use the watchdog to reset the machine as soon as possible. * Intentionally set the expire time on the connection to * the current time so that the watchdog will expire and * reset as soon as possible. 
*/ static int watchdog_reset_self(void) { uint64_t now; int rv; if (!use_watchdog) return 0; now = monotime(); rv = wdmd_test_live(wdmd_fd, now, now); if (rv < 0) { log_error("watchdog_reset_self test_live failed %d", rv); return rv; } log_notice("Resetting host with watchdog"); return 0; } static int setup_wdmd(void) { char name[WDMD_NAME_SIZE]; int con; int rv; if (!use_watchdog) return 0; con = wdmd_connect(); if (con < 0) { log_error("setup_wdmd connect failed %d", con); return con; } memset(name, 0, sizeof(name)); snprintf(name, WDMD_NAME_SIZE - 1, "sanlk-resetd"); rv = wdmd_register(con, name); if (rv < 0) { log_error("setup_wdmd register failed %d", rv); goto fail_close; } /* the refcount tells wdmd that it should not cleanly exit */ rv = wdmd_refcount_set(con); if (rv < 0) { log_error("setup_wdmd refcount_set failed %d", rv); goto fail_close; } log_debug("setup_wdmd %d", con); wdmd_fd = con; return 0; fail_close: close(con); return -1; } static void close_wdmd(void) { if (!use_watchdog) return; wdmd_refcount_clear(wdmd_fd); close(wdmd_fd); } /* * This event will be included in the next lease renewal of the lockspace. * This should be within the next 20 seconds, unless renewals are * experiencing some delays. We have about 60 seconds to get the renewal, * including the event, written before the watchdog fires (or syrq_delay until * sysrq reboot if that is configured). */ static void set_event_out(char *ls_name, uint64_t event_out, uint64_t from_host, uint64_t from_gen) { struct sanlk_host_event he; int rv; he.host_id = from_host; he.generation = from_gen; he.event = event_out; he.data = 0; log_notice("set reply %s%s(%llx %llx) for host %llu %llu ls %s", (event_out & EVENT_RESETTING) ? "resetting " : "", (event_out & EVENT_REBOOTING) ? 
"rebooting " : "", (unsigned long long)he.event, (unsigned long long)he.data, (unsigned long long)from_host, (unsigned long long)from_gen, ls_name); rv = sanlock_set_event(ls_name, &he, SANLK_SETEV_ALL_HOSTS); if (rv < 0) log_error("set_event error %d ls %s", rv, ls_name); } static int find_ls(char *name) { int i; for (i = 0; i < MAX_LS; i++) { if (!ls_names[i]) continue; if (!strcmp(name, ls_names[i])) return i; } return -1; } static int register_ls(int i) { int fd; if (!ls_names[i]) return -ENOMEM; fd = sanlock_reg_event(ls_names[i], NULL, 0); if (fd < 0) { log_error("reg_event %d error %d ls %s", i, fd, ls_names[i]); free(ls_names[i]); ls_names[i] = NULL; return fd; } else { log_debug("reg_event %d fd %d ls %s", i, fd, ls_names[i]); ls_fd[i] = fd; pollfd[i].fd = fd; pollfd[i].events = POLLIN; ls_count++; return 0; } } static void unregister_ls(int i) { log_debug("end_event %d fd %d ls %s", i, ls_fd[i], ls_names[i]); sanlock_end_event(ls_fd[i], ls_names[i], 0); free(ls_names[i]); ls_names[i] = NULL; ls_fd[i] = -1; pollfd[i].fd = -1; pollfd[i].events = 0; ls_count--; } static void get_events(int i) { struct sanlk_host_event from_he; uint64_t from_host, from_gen; uint64_t event, event_out; int set_config_failed; int rv; while (1) { rv = sanlock_get_event(ls_fd[i], 0, &from_he, &from_host, &from_gen); if (rv == -EAGAIN) break; if (rv < 0) { log_error("unregister %d fd %d get_event error %d ls %s", i, ls_fd[i], rv, ls_names[i]); unregister_ls(i); break; } event = from_he.event; event_out = 0; set_config_failed = 0; if (event & (EVENT_RESET | EVENT_REBOOT)) { log_notice("request to %s%s(%llx %llx) from host %llu %llu ls %s", (event & EVENT_RESET) ? "reset " : "", (event & EVENT_REBOOT) ? 
"reboot " : "", (unsigned long long)from_he.event, (unsigned long long)from_he.data, (unsigned long long)from_host, (unsigned long long)from_gen, ls_names[i]); } if (event & (EVENT_RESETTING | EVENT_REBOOTING)) { log_notice("notice of %s%s(%llx %llx) from host %llu %llu ls %s", (event & EVENT_RESETTING) ? "resetting " : "", (event & EVENT_REBOOTING) ? "rebooting " : "", (unsigned long long)from_he.event, (unsigned long long)from_he.data, (unsigned long long)from_host, (unsigned long long)from_gen, ls_names[i]); } if ((event & EVENT_REBOOT) && !use_sysrq_reboot) { event &= ~EVENT_REBOOT; log_error("ignore reboot request sysrq_reboot not enabled"); } if ((event & EVENT_RESET) && !resource_mode) { /* prevent lockspaces from cleanly exiting from lost storage, if this cannot be done, then do not set_event_out. */ rv = sanlock_set_config(ls_names[i], 0, SANLK_CONFIG_USED, NULL); if (rv < 0) { log_error("sanlock_set_config error %d ls %s", rv, ls_names[i]); set_config_failed = 1; } } if ((event & EVENT_RESET) && !we_are_resetting) { we_are_resetting = 1; poll_timeout = 1000; wd_reset_failed = watchdog_reset_self(); } if ((event & EVENT_REBOOT) && !we_are_rebooting) { we_are_rebooting = 1; poll_timeout = 1000; rebooting_time = monotime(); } /* * We attempt to reply to reset requests in any lockspace * where we get one, even though we initiate the reset only * the first time we get the request. The first lockspace * through which we get the request is most likely to get * our reply. Our reply through subsequent lockspaces are * less likely to have time to be written out before the * reset/reboot actually occur. * * Our resetting reply is addressed to all hosts. Multiple * hosts could ask us to reset, and all will get the reply * to the first we receive. 
*/ if (we_are_resetting && !wd_reset_failed) event_out |= EVENT_RESETTING; if (we_are_rebooting) event_out |= EVENT_REBOOTING; if (event_out && !set_config_failed) { set_event_out(ls_names[i], event_out, from_host, from_gen); /* No further events from this lockspace are useful. */ pollfd[i].fd = -1; pollfd[i].events = 0; return; } } } static int setup_signals(void) { sigset_t mask; int rv; sigemptyset(&mask); sigaddset(&mask, SIGTERM); sigaddset(&mask, SIGINT); sigaddset(&mask, SIGHUP); rv = sigprocmask(SIG_BLOCK, &mask, NULL); if (rv < 0) return rv; signal_fd = signalfd(-1, &mask, 0); if (signal_fd < 0) return -errno; return 0; } static void process_signal(int fd) { struct signalfd_siginfo fdsi; ssize_t rv; rv = read(fd, &fdsi, sizeof(struct signalfd_siginfo)); if (rv != sizeof(struct signalfd_siginfo)) return; if ((fdsi.ssi_signo == SIGTERM) || (fdsi.ssi_signo == SIGINT)) { log_debug("daemon_quit signal %d", fdsi.ssi_signo); daemon_quit = 1; } } static int setup_update(void) { int s, rv; rv = mkdir(SANLK_RESETD_RUNDIR, 0755); if (rv < 0 && errno != EEXIST) return rv; s = setup_resetd_socket(); if (s < 0) return s; unlink(update_addr.sun_path); rv = bind(s, (struct sockaddr *) &update_addr, update_addrlen); if (rv < 0) goto fail_close; rv = chmod(update_addr.sun_path, SANLK_RESETD_SOCKET_MODE); if (rv < 0) goto fail_close; update_fd = s; return 0; fail_close: close(s); return -1; } static void process_update(int fd) { char buf[UPDATE_SIZE]; char cmd[UPDATE_SIZE]; char name[UPDATE_SIZE]; int i, rv; memset(buf, 0, sizeof(buf)); memset(cmd, 0, sizeof(cmd)); memset(name, 0, sizeof(name)); rv = recvfrom(fd, buf, UPDATE_SIZE, MSG_DONTWAIT, (struct sockaddr *) &update_addr, &update_addrlen); if (!rv || rv < 0 || rv != UPDATE_SIZE) { log_debug("process_update recvfrom error %d %d", rv, errno); return; } buf[UPDATE_SIZE-1] = '\0'; rv = sscanf(buf, "%s %s", cmd, name); if (rv != 2) { log_debug("process_update ignore message %d", rv); return; } if (!strcmp(cmd, "reg")) { 
log_debug("process_update reg %s", name); /* if the name exists, end then reg */ i = find_ls(name); if (i > -1) { unregister_ls(i); ls_names[i] = strdup(name); register_ls(i); return; } for (i = 0; i < MAX_LS; i++) { if (ls_names[i]) continue; ls_names[i] = strdup(name); register_ls(i); return; } } else if (!strcmp(cmd, "end")) { log_debug("process_update end %s", name); i = find_ls(name); if (i > -1) { unregister_ls(i); return; } } else if (!strcmp(cmd, "clear")) { log_debug("process_update clear %s", name); for (i = 0; i < MAX_LS; i++) { if (!ls_names[i]) continue; unregister_ls(i); } } else { log_debug("process_update cmd unknown"); } } static void usage(void) { printf("%s [options] lockspace_name ...\n", daemon_name); printf(" --help | -h\n"); printf(" Show this help information.\n"); printf(" --version | -V\n"); printf(" Show version.\n"); printf(" --foreground | -f\n"); printf(" Don't fork.\n"); printf(" --daemon-debug | -D\n"); printf(" Don't fork and print debugging to stdout.\n"); printf(" --watchdog | -w 0|1\n"); printf(" Disable (0) use of wdmd/watchdog for testing.\n"); printf(" --sysrq-reboot | -b 0|1\n"); printf(" Enable/Disable (1/0) use of /proc/sysrq-trigger to reboot (default 0).\n"); printf(" --sysrq-delay | -d \n"); printf(" Delay this many seconds before using /proc/sysrq-trigger (default %d).\n", DEFAULT_SYSRQ_DELAY); printf(" --resource-mode | -R 0|1\n"); printf(" Resource leases are used (1) or not used (0) to protect storage.\n"); printf("\n"); printf("Get reset events from lockspace_name (max %d).\n", MAX_LS); } int main(int argc, char *argv[]) { int ls_argc = 0; int i, rv; static struct option long_options[] = { {"help", no_argument, 0, 'h' }, {"version", no_argument, 0, 'V' }, {"foreground", no_argument, 0, 'f' }, {"daemon-debug", no_argument, 0, 'D' }, {"watchdog", required_argument, 0, 'w' }, {"sysrq-reboot", required_argument, 0, 'b' }, {"sysrq-delay", required_argument, 0, 'd' }, {"resource-mode", required_argument, 0, 'R' }, {0, 0, 
0, 0 } }; while (1) { int c; int option_index = 0; c = getopt_long(argc, argv, "hVfDw:b:d:R:", long_options, &option_index); if (c == -1) break; switch (c) { case '0': break; case 'h': usage(); exit(EXIT_SUCCESS); case 'V': printf("%s version: " VERSION "\n", daemon_name); exit(EXIT_SUCCESS); case 'f': daemon_foreground = 1; break; case 'D': daemon_foreground = 1; daemon_debug = 1; break; case 'R': resource_mode = atoi(optarg); break; case 'w': use_watchdog = atoi(optarg); break; case 'b': use_sysrq_reboot = atoi(optarg); break; case 'd': sysrq_delay = atoi(optarg); break; case '?': default: usage(); exit(EXIT_FAILURE); } } for (i = optind; i < argc; i++) { if (ls_argc == MAX_LS) { fprintf(stderr, "ignore lockspace_name %s", argv[i]); continue; } ls_names[ls_argc] = strdup(argv[i]); ls_argc++; } if (!daemon_foreground) { if (daemon(0, 0) < 0) { fprintf(stderr, "cannot fork daemon\n"); exit(EXIT_FAILURE); } } openlog(daemon_name, LOG_CONS | LOG_PID, LOG_DAEMON); log_notice("%s %s started %s", daemon_name, VERSION, use_watchdog ? "" : "use_watchdog=0"); rv = setup_wdmd(); if (rv < 0) { log_error("failed to set up wdmd"); return rv; } rv = setup_signals(); if (rv < 0) { log_error("failed to set up signal fd"); goto out; } rv = setup_update(); if (rv < 0) { log_error("failed to set up update fd"); goto out; } /* * MAX_LS+2: MAX_LS fd's for lockspace, 1 fd for signal_fd, 1 fd for update_fd. 
*/ pollfd = malloc(POLLFD_COUNT * sizeof(struct pollfd)); if (!pollfd) return -ENOMEM; memset(pollfd, 0, POLLFD_COUNT * sizeof(struct pollfd)); for (i = 0; i < POLLFD_COUNT; i++) pollfd[i].fd = -1; pollfd[SIGNAL_INDEX].fd = signal_fd; pollfd[SIGNAL_INDEX].events = POLLIN; pollfd[UPDATE_INDEX].fd = update_fd; pollfd[UPDATE_INDEX].events = POLLIN; /* * register with sanlock for each initial lockspace */ for (i = 0; i < MAX_LS; i++) ls_fd[i] = -1; for (i = 0; i < ls_argc; i++) register_ls(i); poll_timeout = -1; while (1) { rv = poll(pollfd, POLLFD_COUNT, poll_timeout); if (rv == -1 && errno == EINTR) continue; if (rv < 0) break; if (pollfd[SIGNAL_INDEX].revents & POLLIN) process_signal(pollfd[SIGNAL_INDEX].fd); if (pollfd[UPDATE_INDEX].revents & POLLIN) process_update(pollfd[UPDATE_INDEX].fd); if (pollfd[UPDATE_INDEX].revents & (POLLERR | POLLHUP | POLLNVAL)) { close(update_fd); pollfd[UPDATE_INDEX].fd = -1; pollfd[UPDATE_INDEX].events = 0; pollfd[UPDATE_INDEX].revents = 0; } if (daemon_quit) break; if (we_are_rebooting && (monotime() - rebooting_time >= sysrq_delay)) { sysrq_reboot(); } for (i = 0; i < MAX_LS; i++) { if (pollfd[i].revents & POLLIN) get_events(i); if (pollfd[i].revents & (POLLERR | POLLHUP | POLLNVAL)) { log_debug("unregister %d ls_fd %d pollfd %d %x %x ls %s", i, ls_fd[i], pollfd[i].fd, pollfd[i].events, pollfd[i].revents, ls_names[i]); unregister_ls(i); } } } log_debug("unregister daemon_quit=%d ls_count=%d", daemon_quit, ls_count); for (i = 0; i < MAX_LS; i++) { if (!ls_names[i]) continue; if (ls_fd[i] == -1) continue; unregister_ls(i); } out: close_wdmd(); return 0; } sanlock-3.8.2/sanlock.spec.in000066400000000000000000000134751371427612200161100ustar00rootroot00000000000000# NOTE: this spec should be used only for developmemnt. # # To find the real sanlock spec use: # - Fedora: fedpkg clone sanlock # - RHEL: rhpkg clone sanlock # # This spec was created from Fedora spec and modified to work on CentOS. 
Name: sanlock Version: @VERSION@ Release: @RELEASE@%{?dist} Summary: A shared storage lock manager License: GPLv2 and GPLv2+ and LGPLv2+ URL: https://pagure.io/sanlock/ BuildRequires: gcc BuildRequires: libaio-devel BuildRequires: libblkid-devel BuildRequires: libuuid-devel BuildRequires: make BuildRequires: python3 BuildRequires: python3-devel BuildRequires: systemd-units Requires: %{name}-lib = %{version}-%{release} Requires(pre): /usr/sbin/groupadd Requires(pre): /usr/sbin/useradd Requires(post): systemd-units Requires(post): systemd-sysv Requires(preun): systemd-units Requires(postun): systemd-units Source0: https://releases.pagure.org/sanlock/%{name}-%{version}.tar.gz %global python_package python3-%{name} %description The sanlock daemon manages leases for applications on hosts using shared storage. %prep %setup -q %build %if 0%{?fedora} %set_build_flags %endif # upstream does not require configure # upstream does not support _smp_mflags CFLAGS=$RPM_OPT_FLAGS make -C wdmd CFLAGS=$RPM_OPT_FLAGS make -C src CFLAGS=$RPM_OPT_FLAGS make -C python PY_VERSION=3 CFLAGS=$RPM_OPT_FLAGS make -C reset %install rm -rf $RPM_BUILD_ROOT make -C src \ install LIBDIR=%{_libdir} \ DESTDIR=$RPM_BUILD_ROOT make -C wdmd \ install LIBDIR=%{_libdir} \ DESTDIR=$RPM_BUILD_ROOT make -C python \ install LIBDIR=%{_libdir} \ DESTDIR=$RPM_BUILD_ROOT \ PY_VERSION=3 make -C reset \ install LIBDIR=%{_libdir} \ DESTDIR=$RPM_BUILD_ROOT install -D -m 0644 init.d/sanlock.service.native $RPM_BUILD_ROOT/%{_unitdir}/sanlock.service install -D -m 0755 init.d/wdmd $RPM_BUILD_ROOT/usr/lib/systemd/systemd-wdmd install -D -m 0644 init.d/wdmd.service.native $RPM_BUILD_ROOT/%{_unitdir}/wdmd.service install -D -m 0644 init.d/sanlk-resetd.service $RPM_BUILD_ROOT/%{_unitdir}/sanlk-resetd.service install -D -m 0644 src/logrotate.sanlock \ $RPM_BUILD_ROOT/etc/logrotate.d/sanlock install -D -m 0644 src/sanlock.conf \ $RPM_BUILD_ROOT/etc/sanlock/sanlock.conf install -D -m 0644 init.d/wdmd.sysconfig \ 
$RPM_BUILD_ROOT/etc/sysconfig/wdmd install -Dd -m 0755 $RPM_BUILD_ROOT/etc/wdmd.d install -Dd -m 0775 $RPM_BUILD_ROOT/%{_rundir}/sanlock install -Dd -m 0775 $RPM_BUILD_ROOT/%{_rundir}/sanlk-resetd %pre getent group sanlock > /dev/null || /usr/sbin/groupadd \ -g 179 sanlock getent passwd sanlock > /dev/null || /usr/sbin/useradd \ -u 179 -c "sanlock" -s /sbin/nologin -r \ -g 179 -d /run/sanlock sanlock /usr/sbin/usermod -a -G disk sanlock %post %systemd_post wdmd.service sanlock.service %preun %systemd_preun wdmd.service sanlock.service %postun %systemd_postun wdmd.service sanlock.service %files /usr/lib/systemd/systemd-wdmd %{_unitdir}/sanlock.service %{_unitdir}/wdmd.service %{_sbindir}/sanlock %{_sbindir}/wdmd %dir %{_sysconfdir}/wdmd.d %dir %{_sysconfdir}/sanlock %dir %attr(-,sanlock,sanlock) %{_rundir}/sanlock %{_mandir}/man8/wdmd* %{_mandir}/man8/sanlock* %config(noreplace) %{_sysconfdir}/logrotate.d/sanlock %config(noreplace) %{_sysconfdir}/sanlock/sanlock.conf %config(noreplace) %{_sysconfdir}/sysconfig/wdmd %doc init.d/sanlock %doc init.d/sanlock.service %doc init.d/wdmd.service %package lib Summary: A shared storage lock manager library %description lib The %{name}-lib package contains the runtime libraries for sanlock, a shared storage lock manager. Hosts connected to a common SAN can use this to synchronize their access to the shared disks. %ldconfig_scriptlets lib %files lib %{_libdir}/libsanlock.so.* %{_libdir}/libsanlock_client.so.* %{_libdir}/libwdmd.so.* %package -n %{python_package} Summary: Python bindings for the sanlock library Requires: %{name}-lib = %{version}-%{release} %if 0%{?fedora} %{?python_provide:%python_provide %{python_package}} %endif # fedora %description -n %{python_package} The %{python_package} package contains a module that permits applications written in the Python programming language to use the interface supplied by the sanlock library. 
%files -n %{python_package} %{python3_sitearch}/sanlock_python-*.egg-info %{python3_sitearch}/sanlock*.so %package devel Summary: Development files for %{name} Requires: %{name}-lib = %{version}-%{release} %description devel The %{name}-devel package contains libraries and header files for developing applications that use %{name}. %files devel %{_libdir}/libwdmd.so %{_includedir}/wdmd.h %{_libdir}/libsanlock.so %{_libdir}/libsanlock_client.so %{_includedir}/sanlock.h %{_includedir}/sanlock_rv.h %{_includedir}/sanlock_admin.h %{_includedir}/sanlock_resource.h %{_includedir}/sanlock_direct.h %{_libdir}/pkgconfig/libsanlock.pc %{_libdir}/pkgconfig/libsanlock_client.pc %package -n sanlk-reset Summary: Host reset daemon and client using sanlock Requires: sanlock = %{version}-%{release} Requires: sanlock-lib = %{version}-%{release} %description -n sanlk-reset The sanlk-reset package contains the reset daemon and client. A cooperating host running the daemon can be reset by a host running the client, so long as both maintain access to a common sanlock lockspace. %files -n sanlk-reset %{_sbindir}/sanlk-reset %{_sbindir}/sanlk-resetd %{_unitdir}/sanlk-resetd.service %dir %attr(-,root,root) %{_rundir}/sanlk-resetd %{_mandir}/man8/sanlk-reset* %changelog * Wed Jun 12 2019 Nir Soffer - 3.8.0-1 - Convert spec to python 3 * Mon Mar 25 2019 Nir Soffer - 3.7.0-1 - Import spec from Fedora master branch (371e11a) - Fix spec to work on CentOS (20efe91) sanlock-3.8.2/src/000077500000000000000000000000001371427612200137525ustar00rootroot00000000000000sanlock-3.8.2/src/Makefile000066400000000000000000000077721371427612200154270ustar00rootroot00000000000000# Copyright 2010-2011 Red Hat, Inc. # # This copyrighted material is made available to anyone wishing to use, # modify, copy, or redistribute it subject to the terms and conditions # of the GNU General Public License v2 or (at your option) any later version. 
include ../common.mk CMD_TARGET = sanlock HEADER_TARGET = sanlock.h sanlock_rv.h sanlock_resource.h sanlock_admin.h sanlock_direct.h MAN_TARGET = sanlock.8 SOMAJOR=1 SOMINOR=0 LIB_ENTIRE_TARGET = libsanlock LIB_CLIENT_TARGET = libsanlock_client LIBPC_ENTIRE_TARGET = libsanlock.pc LIBPC_CLIENT_TARGET = libsanlock_client.pc LIBSO_ENTIRE_TARGET = $(LIB_ENTIRE_TARGET).so.$(SOMAJOR).$(SOMINOR) LIBSO_CLIENT_TARGET = $(LIB_CLIENT_TARGET).so.$(SOMAJOR).$(SOMINOR) CMD_SOURCE = \ crc32c.c \ delta_lease.c \ direct.c \ diskio.c \ ondisk.c \ sizeflags.c \ helper.c \ lockspace.c \ lockfile.c \ log.c \ main.c \ paxos_lease.c \ task.c \ timeouts.c \ resource.c \ rindex.c \ watchdog.c \ monotime.c \ cmd.c \ client_cmd.c \ sanlock_sock.c \ env.c LIB_ENTIRE_SOURCE = \ client.c \ sanlock_sock.c \ crc32c.c \ diskio.c \ ondisk.c \ sizeflags.c \ delta_lease.c \ paxos_lease.c \ rindex.c \ direct.c \ task.c \ timeouts.c \ direct_lib.c \ monotime.c \ env.c LIB_CLIENT_SOURCE = \ client.c \ sanlock_sock.c \ env.c LIBPC_ENTIRE_SOURCE = libsanlock.pc.in LIBPC_CLIENT_SOURCE = libsanlock_client.pc.in VER=$(shell cat ../VERSION) CFLAGS += -DVERSION=\"$(VER)\" CMD_CFLAGS = $(CFLAGS) -fPIE -DPIE LIB_ENTIRE_CFLAGS = $(CFLAGS) -fPIC LIB_CLIENT_CFLAGS = $(CFLAGS) -fPIC CMD_LDFLAGS = $(LDFLAGS) -Wl,-z,relro -pie LIB_ENTIRE_LDFLAGS = $(LDFLAGS) -Wl,-z,relro -pie LIB_CLIENT_LDFLAGS = $(LDFLAGS) -Wl,-z,relro -pie CMD_LDADD = -lpthread -luuid -lrt -laio -lblkid -lsanlock -L../wdmd -lwdmd LIB_ENTIRE_LDADD = -lpthread -lrt -laio -lblkid -L../wdmd -lwdmd all: $(LIBSO_ENTIRE_TARGET) $(LIBSO_CLIENT_TARGET) $(CMD_TARGET) $(LIBPC_ENTIRE_TARGET) $(LIBPC_CLIENT_TARGET) $(LIBSO_ENTIRE_TARGET): $(LIB_ENTIRE_SOURCE) $(CC) $(LIB_ENTIRE_CFLAGS) $(LIB_ENTIRE_LDFLAGS) -shared -o $@ -Wl,-soname=$(LIB_ENTIRE_TARGET).so.$(SOMAJOR) $^ $(LIB_ENTIRE_LDADD) ln -sf $(LIBSO_ENTIRE_TARGET) $(LIB_ENTIRE_TARGET).so ln -sf $(LIBSO_ENTIRE_TARGET) $(LIB_ENTIRE_TARGET).so.$(SOMAJOR) $(LIBSO_CLIENT_TARGET): $(LIB_CLIENT_SOURCE) $(CC) 
$(LIB_CLIENT_CFLAGS) $(LIB_CLIENT_LDFLAGS) -shared -o $@ -Wl,-soname=$(LIB_CLIENT_TARGET).so.$(SOMAJOR) $^ ln -sf $(LIBSO_CLIENT_TARGET) $(LIB_CLIENT_TARGET).so ln -sf $(LIBSO_CLIENT_TARGET) $(LIB_CLIENT_TARGET).so.$(SOMAJOR) $(CMD_TARGET): $(LIBSO_ENTIRE_TARGET) $(CMD_SOURCE) $(CC) $(CMD_CFLAGS) $(CMD_LDFLAGS) $(CMD_SOURCE) $(CMD_LDADD) -o $@ -L. $(LIBPC_ENTIRE_TARGET): $(LIBPC_ENTIRE_SOURCE) sed -e "s/@VERSION@/$(VER)/" $(LIBPC_ENTIRE_SOURCE) > $(LIBPC_ENTIRE_TARGET) $(LIBPC_CLIENT_TARGET): $(LIBPC_CLIENT_SOURCE) sed -e "s/@VERSION@/$(VER)/" $(LIBPC_CLIENT_SOURCE) > $(LIBPC_CLIENT_TARGET) clean: rm -f *.o *.so *.so.* $(CMD_TARGET) $(LIBSO_ENTIRE_TARGET) $(LIBSO_CLIENT_TARGET) $(LIBPC_ENTIRE_TARGET) $(LIBPC_CLIENT_TARGET) INSTALL=$(shell which install) DESTDIR= BINDIR=/usr/sbin LIBDIR=/usr/lib64 HEADIR=/usr/include MANDIR=/usr/share/man .PHONY: install install: all $(INSTALL) -d $(DESTDIR)/$(BINDIR) $(INSTALL) -d $(DESTDIR)/$(LIBDIR) $(INSTALL) -d $(DESTDIR)/$(HEADIR) $(INSTALL) -d $(DESTDIR)/$(MANDIR)/man8 $(INSTALL) -d $(DESTDIR)/$(LIBDIR)/pkgconfig $(INSTALL) -c -m 755 $(CMD_TARGET) $(DESTDIR)/$(BINDIR) $(INSTALL) -c -m 755 $(LIBSO_ENTIRE_TARGET) $(DESTDIR)/$(LIBDIR) $(INSTALL) -c -m 755 $(LIBSO_CLIENT_TARGET) $(DESTDIR)/$(LIBDIR) $(INSTALL) -c -m 644 $(LIBPC_ENTIRE_TARGET) $(DESTDIR)/$(LIBDIR)/pkgconfig $(INSTALL) -c -m 644 $(LIBPC_CLIENT_TARGET) $(DESTDIR)/$(LIBDIR)/pkgconfig cp -a $(LIB_ENTIRE_TARGET).so $(DESTDIR)/$(LIBDIR) cp -a $(LIB_CLIENT_TARGET).so $(DESTDIR)/$(LIBDIR) cp -a $(LIB_ENTIRE_TARGET).so.$(SOMAJOR) $(DESTDIR)/$(LIBDIR) cp -a $(LIB_CLIENT_TARGET).so.$(SOMAJOR) $(DESTDIR)/$(LIBDIR) $(INSTALL) -c -m 644 $(HEADER_TARGET) $(DESTDIR)/$(HEADIR) $(INSTALL) -m 644 $(MAN_TARGET) $(DESTDIR)/$(MANDIR)/man8/ sanlock-3.8.2/src/client.c000066400000000000000000001266101371427612200154020ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. 
*
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 */

/*
 * NOTE(review): the nineteen directives below carry no header name in this
 * copy of the file; the names appear to have been lost before review and
 * need to be restored from the upstream source - TODO confirm.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "sanlock.h"
#include "sanlock_internal.h"
#include "sanlock_resource.h"
#include "sanlock_admin.h"
#include "sanlock_sock.h"
#include "sanlock_rv.h"
#include "env.h"

#ifndef GNUC_UNUSED
#define GNUC_UNUSED __attribute__((__unused__))
#endif

/*
 * Create an AF_LOCAL stream socket and connect it to the address produced
 * by sanlock_socket_address() for the run directory.
 *
 * The run directory is obtained once through env_get("SANLOCK_RUN_DIR",
 * DEFAULT_RUN_DIR) and cached in a function-local static for later calls.
 *
 * On success returns 0 and stores the connected descriptor in *sock_fd.
 * On failure *sock_fd is left at -1, any socket created here is closed, and
 * the return value is -errno (socket/connect) or the negative value
 * returned by sanlock_socket_address().
 */
static int connect_socket(int *sock_fd)
{
	int rv, s;
	struct sockaddr_un addr;
	static const char *run_dir;

	*sock_fd = -1;
	s = socket(AF_LOCAL, SOCK_STREAM, 0);
	if (s < 0)
		return -errno;

	if (run_dir == NULL)
		run_dir = env_get("SANLOCK_RUN_DIR", DEFAULT_RUN_DIR);

	rv = sanlock_socket_address(run_dir, &addr);
	if (rv < 0) {
		close(s);
		return rv;
	}

	rv = connect(s, (struct sockaddr *) &addr, sizeof(struct sockaddr_un));
	if (rv < 0) {
		/* save errno before close() can overwrite it */
		rv = -errno;
		close(s);
		return rv;
	}
	*sock_fd = s;
	return 0;
}

/*
 * Fill in a zeroed struct sm_header (magic, protocol version, cmd,
 * cmd_flags, data, data2) and send it on sock.  header.length is the size
 * of the header plus datalen, i.e. it announces the payload that the
 * caller sends afterwards.
 *
 * The send is retried when interrupted (EINTR).  Returns 0 when send()
 * did not fail, -errno otherwise.  Note that a short send is not detected
 * here: any non-negative result is treated as success.
 */
static int send_header(int sock, int cmd, uint32_t cmd_flags, int datalen,
		       uint32_t data, uint32_t data2)
{
	struct sm_header header;
	int rv;

	memset(&header, 0, sizeof(header));
	header.magic = SM_MAGIC;
	header.version = SM_PROTO;
	header.cmd = cmd;
	header.cmd_flags = cmd_flags;
	header.length = sizeof(header) + datalen;
	header.data = data;
	header.data2 = data2;

retry:
	rv = send(sock, (void *) &header, sizeof(header), 0);
	if (rv == -1 && errno == EINTR)
		goto retry;

	if (rv < 0)
		return -errno;

	return 0;
}

/*
 * send(2) wrapper that restarts the call when it is interrupted (EINTR).
 * Returns exactly what send() returned: the byte count, or -1 with errno
 * set, so callers convert to -errno themselves.
 */
static ssize_t send_data(int sockfd, const void *buf, size_t len, int flags)
{
	ssize_t rv;
retry:
	rv = send(sockfd, buf, len, flags);
	if (rv == -1 && errno == EINTR)
		goto retry;

	return rv;
}

/*
 * recv(2) wrapper that restarts the call when it is interrupted (EINTR).
 * Returns exactly what recv() returned: the byte count, or -1 with errno
 * set.
 */
static ssize_t recv_data(int sockfd, void *buf, size_t len, int flags)
{
	ssize_t rv;
retry:
	rv = recv(sockfd, buf, len,
flags);
	if (rv == -1 && errno == EINTR)
		goto retry;

	return rv;
}

int send_command(int cmd, uint32_t data);

/*
 * Connect to the daemon and send a header-only command (no flags, no
 * payload, data2 = 0).
 *
 * Returns the connected socket descriptor on success so that the caller
 * can read the reply; the caller then owns and must close it.  Returns a
 * negative error if connecting or sending fails (the socket is closed in
 * the latter case).
 */
int send_command(int cmd, uint32_t data)
{
	int rv, sock;

	rv = connect_socket(&sock);
	if (rv < 0)
		return rv;

	rv = send_header(sock, cmd, 0, 0, data, 0);
	if (rv < 0) {
		close(sock);
		return rv;
	}

	return sock;
}

/*
 * Read one complete struct sm_header reply from fd (MSG_WAITALL, retried
 * on EINTR) and return its data field as the command result.
 *
 * Returns -errno if recv() fails and -1 if fewer bytes than a full header
 * arrive.
 */
static int recv_result(int fd)
{
	struct sm_header h;
	int rv;

	memset(&h, 0, sizeof(h));
retry:
	rv = recv(fd, &h, sizeof(h), MSG_WAITALL);
	if (rv == -1 && errno == EINTR)
		goto retry;
	if (rv < 0)
		return -errno;
	if (rv != sizeof(h))
		return -1;

	return (int)h.data;
}

/*
 * Common path for the lockspace commands below: open a connection, send
 * the header for cmd (with flags and data), send *ls as the payload, and
 * return the result read by recv_result().  The connection is always
 * closed before returning.
 */
static int cmd_lockspace(int cmd, struct sanlk_lockspace *ls, uint32_t flags, uint32_t data)
{
	int rv, fd;

	rv = connect_socket(&fd);
	if (rv < 0)
		return rv;

	rv = send_header(fd, cmd, flags, sizeof(struct sanlk_lockspace), data, 0);
	if (rv < 0)
		goto out;

	rv = send_data(fd, (void *)ls, sizeof(struct sanlk_lockspace), 0);
	if (rv < 0) {
		rv = -errno;
		goto out;
	}

	rv = recv_result(fd);
out:
	close(fd);
	return rv;
}

/* SM_CMD_ADD_LOCKSPACE with the header data field set to 0. */
int sanlock_add_lockspace(struct sanlk_lockspace *ls, uint32_t flags)
{
	return cmd_lockspace(SM_CMD_ADD_LOCKSPACE, ls, flags, 0);
}

/* SM_CMD_ADD_LOCKSPACE with io_timeout passed in the header data field. */
int sanlock_add_lockspace_timeout(struct sanlk_lockspace *ls, uint32_t flags,
				  uint32_t io_timeout)
{
	return cmd_lockspace(SM_CMD_ADD_LOCKSPACE, ls, flags, io_timeout);
}

/* SM_CMD_INQ_LOCKSPACE for *ls. */
int sanlock_inq_lockspace(struct sanlk_lockspace *ls, uint32_t flags)
{
	return cmd_lockspace(SM_CMD_INQ_LOCKSPACE, ls, flags, 0);
}

/* SM_CMD_REM_LOCKSPACE for *ls. */
int sanlock_rem_lockspace(struct sanlk_lockspace *ls, uint32_t flags)
{
	return cmd_lockspace(SM_CMD_REM_LOCKSPACE, ls, flags, 0);
}

/*
 * Send SM_CMD_GET_LOCKSPACES and read back the reply header followed by
 * an array of struct sanlk_lockspace.  (The remainder of this function,
 * including its return contract, follows the header receive below.)
 */
int sanlock_get_lockspaces(struct sanlk_lockspace **lss, int *lss_count,
			   uint32_t flags)
{
	struct sanlk_lockspace *lsbuf, *ls;
	struct sm_header h;
	int rv, fd, i, ret, recv_count;

	rv = connect_socket(&fd);
	if (rv < 0)
		return rv;

	rv = send_header(fd, SM_CMD_GET_LOCKSPACES, flags, 0, 0, 0);
	if (rv < 0)
		goto out;

	/* receive result and ls structs */

	memset(&h, 0, sizeof(h));

	rv = recv_data(fd, &h, sizeof(h), MSG_WAITALL);
	if (rv < 0) {
		rv = -errno;
goto out; } if (rv != sizeof(h)) { rv = -1; goto out; } /* -ENOSPC means that the daemon's send buffer ran out of space */ rv = (int)h.data; if (rv < 0 && rv != -ENOSPC) goto out; *lss_count = h.data2; recv_count = h.data2; if (!lss) goto out; lsbuf = malloc(recv_count * sizeof(struct sanlk_lockspace)); if (!lsbuf) goto out; ls = lsbuf; for (i = 0; i < recv_count; i++) { ret = recv_data(fd, ls, sizeof(struct sanlk_lockspace), MSG_WAITALL); if (ret < 0) { rv = -errno; free(lsbuf); goto out; } if (ret != sizeof(struct sanlk_lockspace)) { rv = -1; free(lsbuf); goto out; } ls++; } *lss = lsbuf; out: close(fd); return rv; } int sanlock_get_hosts(const char *ls_name, uint64_t host_id, struct sanlk_host **hss, int *hss_count, uint32_t flags) { struct sm_header h; struct sanlk_lockspace ls; struct sanlk_host *hsbuf, *hs; int rv, fd, i, ret, recv_count; if (!ls_name) return -EINVAL; memset(&ls, 0, sizeof(struct sanlk_lockspace)); strncpy(ls.name, ls_name, SANLK_NAME_LEN); ls.host_id = host_id; rv = connect_socket(&fd); if (rv < 0) return rv; rv = send_header(fd, SM_CMD_GET_HOSTS, flags, sizeof(struct sanlk_lockspace), 0, 0); if (rv < 0) goto out; rv = send_data(fd, &ls, sizeof(struct sanlk_lockspace), 0); if (rv < 0) { rv = -errno; goto out; } /* receive result and ls structs */ memset(&h, 0, sizeof(h)); rv = recv_data(fd, &h, sizeof(h), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(h)) { rv = -1; goto out; } /* -ENOSPC means that the daemon's send buffer ran out of space */ rv = (int)h.data; if (rv < 0 && rv != -ENOSPC) goto out; *hss_count = h.data2; recv_count = h.data2; if (!hss) goto out; hsbuf = malloc(recv_count * sizeof(struct sanlk_host)); if (!hsbuf) goto out; hs = hsbuf; for (i = 0; i < recv_count; i++) { ret = recv_data(fd, hs, sizeof(struct sanlk_host), MSG_WAITALL); if (ret < 0) { rv = -errno; free(hsbuf); goto out; } if (ret != sizeof(struct sanlk_host)) { rv = -1; free(hsbuf); goto out; } hs++; } *hss = hsbuf; out: close(fd); return 
rv; } int sanlock_set_config(const char *ls_name, uint32_t flags, uint32_t cmd, GNUC_UNUSED void *data) { struct sanlk_lockspace ls; struct sm_header h; int rv, fd; if (!ls_name) return -EINVAL; memset(&ls, 0, sizeof(struct sanlk_lockspace)); strncpy(ls.name, ls_name, SANLK_NAME_LEN); rv = connect_socket(&fd); if (rv < 0) return rv; rv = send_header(fd, SM_CMD_SET_CONFIG, flags, sizeof(struct sanlk_lockspace), cmd, 0); if (rv < 0) goto out; rv = send_data(fd, &ls, sizeof(ls), 0); if (rv < 0) { rv = -errno; goto out; } memset(&h, 0, sizeof(h)); rv = recv_data(fd, &h, sizeof(h), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(h)) { rv = -1; goto out; } rv = (int)h.data; out: close(fd); return rv; } int sanlock_align(struct sanlk_disk *disk) { int rv, fd; rv = connect_socket(&fd); if (rv < 0) return rv; rv = send_header(fd, SM_CMD_ALIGN, 0, sizeof(struct sanlk_disk), 0, 0); if (rv < 0) goto out; rv = send_data(fd, (void *)disk, sizeof(struct sanlk_disk), 0); if (rv < 0) { rv = -errno; goto out; } rv = recv_result(fd); out: close(fd); return rv; } int sanlock_read_lockspace(struct sanlk_lockspace *ls, uint32_t flags, uint32_t *io_timeout) { struct sm_header h; int rv, fd; if (!ls || !ls->host_id_disk.path[0]) return -EINVAL; rv = connect_socket(&fd); if (rv < 0) return rv; rv = send_header(fd, SM_CMD_READ_LOCKSPACE, flags, sizeof(struct sanlk_lockspace), 0, 0); if (rv < 0) goto out; rv = send_data(fd, ls, sizeof(struct sanlk_lockspace), 0); if (rv < 0) { rv = -errno; goto out; } /* receive result, io_timeout and ls struct */ memset(&h, 0, sizeof(h)); rv = recv_data(fd, &h, sizeof(h), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(h)) { rv = -1; goto out; } rv = (int)h.data; if (rv < 0) goto out; rv = recv_data(fd, ls, sizeof(struct sanlk_lockspace), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(struct sanlk_lockspace)) { rv = -1; goto out; } *io_timeout = h.data2; rv = (int)h.data; out: close(fd); 
return rv;
}

/*
 * Send SM_CMD_READ_RESOURCE for *res (the resource struct followed by its
 * num_disks disk entries) and overwrite *res with the struct sent back.
 *
 * Unlike sanlock_read_lockspace(), the resource struct is read from the
 * socket regardless of the result in the reply header.
 *
 * Returns the result from the reply header, -EINVAL when res is NULL, has
 * no disks, more than SANLK_MAX_DISKS disks, or an empty first disk path,
 * -errno on socket errors, or -1 on a short read.
 */
int sanlock_read_resource(struct sanlk_resource *res, uint32_t flags)
{
	struct sm_header h;
	int rv, fd;

	if (!res || !res->num_disks || res->num_disks > SANLK_MAX_DISKS ||
	    !res->disks[0].path[0])
		return -EINVAL;

	rv = connect_socket(&fd);
	if (rv < 0)
		return rv;

	rv = send_header(fd, SM_CMD_READ_RESOURCE, flags,
			 sizeof(struct sanlk_resource) +
			 sizeof(struct sanlk_disk) * res->num_disks,
			 0, 0);
	if (rv < 0)
		goto out;

	rv = send_data(fd, res, sizeof(struct sanlk_resource), 0);
	if (rv < 0) {
		rv = -errno;
		goto out;
	}

	rv = send_data(fd, res->disks, sizeof(struct sanlk_disk) * res->num_disks, 0);
	if (rv < 0) {
		rv = -errno;
		goto out;
	}

	/* receive result and res struct */

	memset(&h, 0, sizeof(h));

	rv = recv_data(fd, &h, sizeof(h), MSG_WAITALL);
	if (rv < 0) {
		rv = -errno;
		goto out;
	}
	if (rv != sizeof(h)) {
		rv = -1;
		goto out;
	}

	rv = recv_data(fd, res, sizeof(struct sanlk_resource), MSG_WAITALL);
	if (rv < 0) {
		rv = -errno;
		goto out;
	}
	if (rv != sizeof(struct sanlk_resource)) {
		rv = -1;
		goto out;
	}

	rv = (int)h.data;
out:
	close(fd);
	return rv;
}

/*
 * Send SM_CMD_WRITE_LOCKSPACE with *ls as the payload; max_hosts and
 * io_timeout travel in the header data and data2 fields.
 *
 * Returns the result read by recv_result(), -EINVAL for a missing ls or
 * empty host_id_disk path, or a negative error on socket failure.
 */
int sanlock_write_lockspace(struct sanlk_lockspace *ls, int max_hosts,
			    uint32_t flags, uint32_t io_timeout)
{
	int rv, fd;

	if (!ls || !ls->host_id_disk.path[0])
		return -EINVAL;

	rv = connect_socket(&fd);
	if (rv < 0)
		return rv;

	rv = send_header(fd, SM_CMD_WRITE_LOCKSPACE, flags,
			 sizeof(struct sanlk_lockspace),
			 max_hosts, io_timeout);
	if (rv < 0)
		goto out;

	rv = send_data(fd, ls, sizeof(struct sanlk_lockspace), 0);
	if (rv < 0) {
		rv = -errno;
		goto out;
	}

	rv = recv_result(fd);
out:
	close(fd);
	return rv;
}

/*
 * Send SM_CMD_WRITE_RESOURCE for *res (the resource struct followed by
 * its disk entries); max_hosts and num_hosts travel in the header data
 * and data2 fields.  Argument validation matches sanlock_read_resource().
 */
int sanlock_write_resource(struct sanlk_resource *res,
			   int max_hosts, int num_hosts, uint32_t flags)
{
	int rv, fd;

	if (!res || !res->num_disks || res->num_disks > SANLK_MAX_DISKS ||
	    !res->disks[0].path[0])
		return -EINVAL;

	rv = connect_socket(&fd);
	if (rv < 0)
		return rv;

	rv = send_header(fd, SM_CMD_WRITE_RESOURCE, flags,
			 sizeof(struct sanlk_resource) +
			 sizeof(struct sanlk_disk) * res->num_disks,
			 max_hosts, num_hosts);
	if (rv < 0)
		goto
out; rv = send_data(fd, res, sizeof(struct sanlk_resource), 0); if (rv < 0) { rv = -errno; goto out; } rv = send_data(fd, res->disks, sizeof(struct sanlk_disk) * res->num_disks, 0); if (rv < 0) { rv = -errno; goto out; } rv = recv_result(fd); out: close(fd); return rv; } int sanlock_read_resource_owners(struct sanlk_resource *res, uint32_t flags, struct sanlk_host **hss, int *hss_count) { struct sm_header h; struct sanlk_host *hsbuf, *hs; int rv, fd, i, ret, recv_count; if (!res || !res->num_disks || res->num_disks > SANLK_MAX_DISKS || !res->disks[0].path[0]) return -EINVAL; rv = connect_socket(&fd); if (rv < 0) return rv; rv = send_header(fd, SM_CMD_READ_RESOURCE_OWNERS, flags, sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk) * res->num_disks, 0, 0); if (rv < 0) goto out; rv = send_data(fd, res, sizeof(struct sanlk_resource), 0); if (rv < 0) { rv = -errno; goto out; } rv = send_data(fd, res->disks, sizeof(struct sanlk_disk) * res->num_disks, 0); if (rv < 0) { rv = -errno; goto out; } /* receive result, res struct, and host structs */ memset(&h, 0, sizeof(h)); rv = recv_data(fd, &h, sizeof(h), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(h)) { rv = -1; goto out; } rv = (int)h.data; if (rv < 0) goto out; rv = recv_data(fd, res, sizeof(struct sanlk_resource), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(struct sanlk_resource)) { rv = -1; goto out; } rv = 0; *hss_count = h.data2; recv_count = h.data2; if (!hss) goto out; hsbuf = malloc(recv_count * sizeof(struct sanlk_host)); if (!hsbuf) goto out; hs = hsbuf; for (i = 0; i < recv_count; i++) { ret = recv_data(fd, hs, sizeof(struct sanlk_host), MSG_WAITALL); if (ret < 0) { rv = -errno; free(hsbuf); goto out; } if (ret != sizeof(struct sanlk_host)) { rv = -1; free(hsbuf); goto out; } hs++; } *hss = hsbuf; out: close(fd); return rv; } int sanlock_test_resource_owners(struct sanlk_resource *res GNUC_UNUSED, uint32_t flags GNUC_UNUSED, struct sanlk_host *owners, 
int owners_count, struct sanlk_host *hosts, int hosts_count, uint32_t *test_flags) { struct sanlk_host *owner, *host; int i, j, found, fail = 0; *test_flags = 0; owner = owners; for (i = 0; i < owners_count; i++) { found = 0; host = hosts; for (j = 0; j < hosts_count; j++) { if (owner->host_id != host->host_id) { host++; continue; } found = 1; break; } if (!found) goto next; if (host->generation > owner->generation) goto next; /* this should not be possible, and should never happen */ if (host->generation < owner->generation) return -EINVAL; switch (host->flags & SANLK_HOST_MASK) { case SANLK_HOST_FREE: case SANLK_HOST_DEAD: break; case SANLK_HOST_LIVE: case SANLK_HOST_FAIL: case SANLK_HOST_UNKNOWN: fail = 1; break; default: return -EINVAL; } next: owner++; } if (fail) *test_flags |= SANLK_TRF_FAIL; return 0; } int sanlock_reg_event(const char *ls_name, struct sanlk_host_event *he, uint32_t flags) { struct sm_header h; struct sanlk_lockspace ls; struct sanlk_host_event ev; int rv, reg_fd; if (!ls_name) return -EINVAL; memset(&ls, 0, sizeof(ls)); strncpy(ls.name, ls_name, SANLK_NAME_LEN); memset(&ev, 0, sizeof(ev)); if (he) memcpy(&ev, he, sizeof(ev)); rv = connect_socket(®_fd); if (rv < 0) return rv; rv = send_header(reg_fd, SM_CMD_REG_EVENT, flags, sizeof(ls) + sizeof(ev), 0, 0); if (rv < 0) goto fail; rv = send_data(reg_fd, &ls, sizeof(ls), 0); if (rv < 0) goto fail; rv = send_data(reg_fd, &ev, sizeof(ev), 0); if (rv < 0) goto fail; memset(&h, 0, sizeof(h)); rv = recv_data(reg_fd, &h, sizeof(h), MSG_WAITALL); if (rv < 0) { rv = -errno; goto fail; } if (rv != sizeof(h)) { rv = -1; goto fail; } rv = (int)h.data; if (rv < 0) goto fail; return reg_fd; fail: close(reg_fd); return rv; } int sanlock_end_event(int reg_fd, const char *ls_name, uint32_t flags) { struct sm_header h; struct sanlk_lockspace ls; int rv, fd; uint32_t end = 1; if (!ls_name) return -EINVAL; /* * write 4 bytes to the registered fd. 
sanlock attempts
	 * a non-blocking read of 4 bytes from registered fds to
	 * check if they have been unregistered.
	 */

	rv = send_data(reg_fd, &end, sizeof(end), 0);
	if (rv < 0) {
		/* the registered fd is closed either way */
		close(reg_fd);
		return -EALREADY;
	}

	close(reg_fd);

	/*
	 * sanlock does not poll registered event fds because
	 * it receives nothing from them during normal operation,
	 * only to indicate it's being closed.  So, we need
	 * to tell sanlock to check the registered event fds to
	 * remove the one we've written to and closed above.
	 */

	memset(&ls, 0, sizeof(ls));
	strncpy(ls.name, ls_name, SANLK_NAME_LEN);

	/* second, short-lived connection that carries SM_CMD_END_EVENT */
	rv = connect_socket(&fd);
	if (rv < 0)
		return rv;

	rv = send_header(fd, SM_CMD_END_EVENT, flags, sizeof(ls), 0, 0);
	if (rv < 0)
		goto out;

	/* NOTE: a failure here returns send()'s raw -1, not -errno */
	rv = send_data(fd, &ls, sizeof(ls), 0);
	if (rv < 0)
		goto out;

	memset(&h, 0, sizeof(h));

	rv = recv_data(fd, &h, sizeof(h), MSG_WAITALL);
	if (rv < 0) {
		rv = -errno;
		goto out;
	}

	if (rv != sizeof(h)) {
		rv = -1;
		goto out;
	}

	rv = (int)h.data;
	if (rv < 0)
		goto out;

	/* any non-negative result from the daemon is reported as 0 */
	rv = 0;
 out:
	close(fd);
	return rv;
}

/*
 * Send SM_CMD_SET_EVENT for the lockspace named ls_name, with a lockspace
 * struct and *he as the payload.
 *
 * Returns the result from the reply header, -EINVAL when ls_name or he is
 * NULL, -errno on socket errors, or -1 on a short read.
 */
int sanlock_set_event(const char *ls_name, struct sanlk_host_event *he, uint32_t flags)
{
	struct sanlk_lockspace ls;
	struct sm_header h;
	int rv, fd;

	if (!ls_name || !he)
		return -EINVAL;

	memset(&ls, 0, sizeof(struct sanlk_lockspace));
	strncpy(ls.name, ls_name, SANLK_NAME_LEN);

	rv = connect_socket(&fd);
	if (rv < 0)
		return rv;

	rv = send_header(fd, SM_CMD_SET_EVENT, flags,
			 sizeof(struct sanlk_lockspace) + sizeof(struct sanlk_host_event),
			 0, 0);
	if (rv < 0)
		goto out;

	rv = send_data(fd, &ls, sizeof(ls), 0);
	if (rv < 0) {
		rv = -errno;
		goto out;
	}

	rv = send_data(fd, he, sizeof(struct sanlk_host_event), 0);
	if (rv < 0) {
		rv = -errno;
		goto out;
	}

	memset(&h, 0, sizeof(h));

	rv = recv_data(fd, &h, sizeof(h), MSG_WAITALL);
	if (rv < 0) {
		rv = -errno;
		goto out;
	}
	if (rv != sizeof(h)) {
		rv = -1;
		goto out;
	}

	rv = (int)h.data;
out:
	close(fd);
	return rv;
}

/*
 * Read one event from a descriptor returned by sanlock_reg_event().
 * The flags argument is unused.
 */
int sanlock_get_event(int reg_fd, GNUC_UNUSED uint32_t flags, struct sanlk_host_event *he,
		      uint64_t *from_host_id, uint64_t *from_generation)
{
struct event_cb cb;
	int rv;

	/*
	 * The caller's poll(2) indicates there's data, it doesn't know how
	 * many events to read, and doesn't want to block, so they want to
	 * get events until we return -EAGAIN to indicate there are no more.
	 */

	/* MSG_DONTWAIT: a would-block result comes back as -errno below */
	rv = recv_data(reg_fd, &cb, sizeof(cb), MSG_DONTWAIT);
	if (rv < 0)
		return -errno;
	if (rv != sizeof(cb))
		return -1;

	/* he is required; the two "from" outputs are optional */
	memcpy(he, &cb.he, sizeof(struct sanlk_host_event));

	if (from_host_id)
		*from_host_id = cb.from_host_id;
	if (from_generation)
		*from_generation = cb.from_generation;

	return 0;
}

/* old api */

/*
 * Dispatches to sanlock_write_lockspace() when ls is non-NULL, otherwise
 * to sanlock_write_resource(); flags (and io_timeout) are passed as 0.
 */
int sanlock_init(struct sanlk_lockspace *ls,
		 struct sanlk_resource *res,
		 int max_hosts, int num_hosts)
{
	if (ls)
		return sanlock_write_lockspace(ls, max_hosts, 0, 0);
	else
		return sanlock_write_resource(res, max_hosts, num_hosts, 0);
}

/*
 * Connect to the daemon and send SM_CMD_REGISTER.  No reply is read.
 *
 * Returns the connected descriptor, which the caller keeps and passes as
 * the sock argument of later calls, or a negative error.
 */
int sanlock_register(void)
{
	int sock, rv;

	rv = connect_socket(&sock);
	if (rv < 0)
		return rv;

	rv = send_header(sock, SM_CMD_REGISTER, 0, 0, 0, 0);
	if (rv < 0) {
		close(sock);
		return rv;
	}

	return sock;
}

/*
 * Send SM_CMD_RESTRICT with flags on an existing connection and return
 * the result read by recv_result().  The socket is left open.
 */
int sanlock_restrict(int sock, uint32_t flags)
{
	int rv;

	rv = send_header(sock, SM_CMD_RESTRICT, flags, 0, 0, -1);
	if (rv < 0)
		return rv;

	rv = recv_result(sock);
	return rv;
}

/*
 * Send SM_CMD_VERSION and report the values found in the reply header.
 *
 * version: receives h.data2 when the result is non-negative; it is
 *          dereferenced without a NULL check.
 * proto:   optional; receives h.version whenever a full header arrives,
 *          even if the result is negative.
 *
 * Returns 0 on success, the negative result from the reply header,
 * -errno on socket errors, or -1 on a short read.
 */
int sanlock_version(uint32_t flags, uint32_t *version, uint32_t *proto)
{
	struct sm_header h;
	int fd, rv;

	rv = connect_socket(&fd);
	if (rv < 0)
		return rv;

	rv = send_header(fd, SM_CMD_VERSION, flags, 0, 0, 0);
	if (rv < 0)
		goto out;

	memset(&h, 0, sizeof(h));

	rv = recv_data(fd, &h, sizeof(h), MSG_WAITALL);
	if (rv < 0) {
		rv = -errno;
		goto out;
	}
	if (rv != sizeof(h)) {
		rv = -1;
		goto out;
	}

	if (proto)
		*proto = h.version;

	rv = (int)h.data;
	if (rv < 0)
		goto out;

	*version = h.data2;
	rv = 0;
 out:
	close(fd);
	return rv;
}

/*
 * Send SM_CMD_KILLPATH on an existing connection.  path and args are
 * copied into zero-filled, fixed-size buffers (SANLK_HELPER_PATH_LEN and
 * SANLK_HELPER_ARGS_LEN bytes) and both buffers are sent in full.
 */
int sanlock_killpath(int sock, uint32_t flags, const char *path, char *args)
{
	char path_max[SANLK_HELPER_PATH_LEN];
	char args_max[SANLK_HELPER_ARGS_LEN];
	int rv, datalen;

	datalen = SANLK_HELPER_PATH_LEN + SANLK_HELPER_ARGS_LEN;

	memset(path_max, 0, sizeof(path_max));
	memset(args_max, 0, sizeof(args_max));

	snprintf(path_max,
SANLK_HELPER_PATH_LEN-1, "%s", path); snprintf(args_max, SANLK_HELPER_ARGS_LEN-1, "%s", args); rv = send_header(sock, SM_CMD_KILLPATH, flags, datalen, 0, -1); if (rv < 0) return rv; rv = send_data(sock, path_max, SANLK_HELPER_PATH_LEN, 0); if (rv < 0) { rv = -errno; goto out; } rv = send_data(sock, args_max, SANLK_HELPER_ARGS_LEN, 0); if (rv < 0) { rv = -errno; goto out; } rv = recv_result(sock); out: return rv; } int sanlock_acquire(int sock, int pid, uint32_t flags, int res_count, struct sanlk_resource *res_args[], struct sanlk_options *opt_in) { struct sanlk_resource *res; struct sanlk_options opt; int rv, i, fd, data2; int datalen = 0; if (res_count > SANLK_MAX_RESOURCES) return -EINVAL; for (i = 0; i < res_count; i++) { res = res_args[i]; datalen += sizeof(struct sanlk_resource); if (res->num_disks > SANLK_MAX_DISKS) return -EINVAL; datalen += (res->num_disks * sizeof(struct sanlk_disk)); } datalen += sizeof(struct sanlk_options); if (opt_in) { memcpy(&opt, opt_in, sizeof(struct sanlk_options)); datalen += opt_in->len; } else { memset(&opt, 0, sizeof(opt)); } if (sock == -1) { /* connect to daemon and ask it to acquire a lease for another registered pid */ data2 = pid; rv = connect_socket(&fd); if (rv < 0) return rv; } else { /* use our own existing registered connection and ask daemon to acquire a lease for self */ data2 = -1; fd = sock; } rv = send_header(fd, SM_CMD_ACQUIRE, flags, datalen, res_count, data2); if (rv < 0) return rv; for (i = 0; i < res_count; i++) { res = res_args[i]; rv = send_data(fd, res, sizeof(struct sanlk_resource), 0); if (rv < 0) { rv = -1; goto out; } rv = send_data(fd, res->disks, sizeof(struct sanlk_disk) * res->num_disks, 0); if (rv < 0) { rv = -1; goto out; } } rv = send_data(fd, &opt, sizeof(struct sanlk_options), 0); if (rv < 0) { rv = -1; goto out; } if (opt.len) { rv = send_data(fd, opt_in->str, opt.len, 0); if (rv < 0) { rv = -1; goto out; } } rv = recv_result(fd); out: if (sock == -1) close(fd); return rv; } int 
sanlock_inquire(int sock, int pid, uint32_t flags, int *res_count, char **res_state) { struct sm_header h; char *reply_data = NULL; int rv, fd, data2, len; *res_count = 0; if (res_state) *res_state = NULL; if (sock == -1) { /* connect to daemon and ask it to acquire a lease for another registered pid */ data2 = pid; rv = connect_socket(&fd); if (rv < 0) return rv; } else { /* use our own existing registered connection and ask daemon to acquire a lease for self */ data2 = -1; fd = sock; } rv = send_header(fd, SM_CMD_INQUIRE, flags, 0, 0, data2); if (rv < 0) return rv; /* get result */ memset(&h, 0, sizeof(h)); rv = recv_data(fd, &h, sizeof(h), MSG_WAITALL); if (rv != sizeof(h)) { rv = -1; goto out; } len = h.length - sizeof(h); if (!len) { rv = (int)h.data; goto out; } reply_data = malloc(len); if (!reply_data) { rv = -ENOMEM; goto out; } rv = recv_data(fd, reply_data, len, MSG_WAITALL); if (rv != len) { free(reply_data); rv = -1; goto out; } if (res_state) *res_state = reply_data; else free(reply_data); *res_count = (int)h.data2; rv = (int)h.data; out: if (sock == -1) close(fd); return rv; } int sanlock_convert(int sock, int pid, uint32_t flags, struct sanlk_resource *res) { int fd, rv, data2, datalen; if (!res) return -EINVAL; if (sock == -1) { /* connect to daemon and ask it to acquire a lease for another registered pid */ data2 = pid; rv = connect_socket(&fd); if (rv < 0) return rv; } else { /* use our own existing registered connection and ask daemon to acquire a lease for self */ data2 = -1; fd = sock; } datalen = sizeof(struct sanlk_resource); rv = send_header(fd, SM_CMD_CONVERT, flags, datalen, 0, data2); if (rv < 0) goto out; rv = send_data(fd, res, sizeof(struct sanlk_resource), 0); if (rv < 0) { rv = -errno; goto out; } rv = recv_result(fd); out: if (sock == -1) close(fd); return rv; } /* tell daemon to release lease(s) for given pid. 
I don't think the pid itself will usually tell sm to release leases,
   but it will be requested by a manager overseeing the pid */

/*
 * Sends SM_CMD_RELEASE with res_count resource structs as the payload
 * (disk entries are not sent).  With sock == -1 a temporary connection is
 * opened and pid is sent in data2; otherwise the caller's registered
 * connection is used and data2 is -1.
 *
 * Returns the result read by recv_result(), the error from
 * connect_socket()/send_header(), or -1 when sending a resource fails.
 */
int sanlock_release(int sock, int pid, uint32_t flags, int res_count,
		    struct sanlk_resource *res_args[])
{
	int fd, rv, i, data2, datalen;

	if (sock == -1) {
		/* connect to daemon and ask it to release leases held by
		   another registered pid */

		data2 = pid;

		rv = connect_socket(&fd);
		if (rv < 0)
			return rv;
	} else {
		/* use our own existing registered connection and ask daemon
		   to release leases held by self */

		data2 = -1;
		fd = sock;
	}

	datalen = res_count * sizeof(struct sanlk_resource);

	rv = send_header(fd, SM_CMD_RELEASE, flags, datalen, res_count, data2);
	if (rv < 0)
		goto out;

	for (i = 0; i < res_count; i++) {
		rv = send_data(fd, res_args[i], sizeof(struct sanlk_resource), 0);
		if (rv < 0) {
			rv = -1;
			goto out;
		}
	}

	rv = recv_result(fd);
 out:
	/* only close a connection that was opened here */
	if (sock == -1)
		close(fd);
	return rv;
}

/*
 * Send SM_CMD_REQUEST for *res (the resource struct followed by its
 * num_disks disk entries); force_mode travels in the header data field.
 *
 * Returns the result read by recv_result(), -EINVAL for a NULL res, or a
 * negative error on socket failure.  Unlike the read/write resource
 * calls, num_disks is not range-checked here.
 */
int sanlock_request(uint32_t flags, uint32_t force_mode,
		    struct sanlk_resource *res)
{
	int fd, rv, datalen;

	if (!res)
		return -EINVAL;

	datalen = sizeof(struct sanlk_resource) +
		  sizeof(struct sanlk_disk) * res->num_disks;

	rv = connect_socket(&fd);
	if (rv < 0)
		return rv;

	rv = send_header(fd, SM_CMD_REQUEST, flags, datalen, force_mode, 0);
	if (rv < 0)
		goto out;

	rv = send_data(fd, res, sizeof(struct sanlk_resource), 0);
	if (rv < 0) {
		rv = -errno;
		goto out;
	}

	rv = send_data(fd, res->disks, sizeof(struct sanlk_disk) * res->num_disks, 0);
	if (rv < 0) {
		rv = -errno;
		goto out;
	}

	rv = recv_result(fd);
 out:
	close(fd);
	return rv;
}

/*
 * Send either SM_CMD_EXAMINE_LOCKSPACE (when ls is given and has a
 * non-empty host_id_disk path) or SM_CMD_EXAMINE_RESOURCE (otherwise),
 * with the corresponding struct as the payload.
 */
int sanlock_examine(uint32_t flags, struct sanlk_lockspace *ls,
		    struct sanlk_resource *res)
{
	char *data;
	int rv, fd, cmd, datalen;

	if (!ls && !res)
		return -EINVAL;

	rv = connect_socket(&fd);
	if (rv < 0)
		return rv;

	if (ls && ls->host_id_disk.path[0]) {
		cmd = SM_CMD_EXAMINE_LOCKSPACE;
		datalen = sizeof(struct sanlk_lockspace);
		data = (char *)ls;
	} else {
		/* NOTE(review): res may be NULL here if ls was given with an
		   empty path - confirm callers never do that */
		cmd = SM_CMD_EXAMINE_RESOURCE;
		datalen = sizeof(struct sanlk_resource);
		data = (char *)res;
	}

	rv =
send_header(fd, cmd, flags, datalen, 0, 0); if (rv < 0) goto out; rv = send_data(fd, data, datalen, 0); if (rv < 0) { rv = -errno; goto out; } rv = recv_result(fd); out: close(fd); return rv; } int sanlock_set_lvb(uint32_t flags, struct sanlk_resource *res, char *lvb, int lvblen) { int datalen = 0; int rv, fd; if (!res || !lvb || !lvblen) return -EINVAL; datalen = sizeof(struct sanlk_resource) + lvblen; rv = connect_socket(&fd); if (rv < 0) return rv; rv = send_header(fd, SM_CMD_SET_LVB, flags, datalen, 0, 0); if (rv < 0) return rv; rv = send_data(fd, res, sizeof(struct sanlk_resource), 0); if (rv < 0) { rv = -errno; goto out; } rv = send_data(fd, lvb, lvblen, 0); if (rv < 0) { rv = -1; goto out; } rv = recv_result(fd); out: close(fd); return rv; } int sanlock_get_lvb(uint32_t flags, struct sanlk_resource *res, char *lvb, int lvblen) { struct sm_header h; char *reply_data = NULL; int datalen = 0; int rv, fd, len; if (!res || !lvb || !lvblen) return -EINVAL; datalen = sizeof(struct sanlk_resource); rv = connect_socket(&fd); if (rv < 0) return rv; rv = send_header(fd, SM_CMD_GET_LVB, flags, datalen, 0, 0); if (rv < 0) return rv; rv = send_data(fd, res, sizeof(struct sanlk_resource), 0); if (rv < 0) { rv = -1; goto out; } /* get result */ memset(&h, 0, sizeof(h)); rv = recv_data(fd, &h, sizeof(h), MSG_WAITALL); if (rv != sizeof(h)) { rv = -1; goto out; } len = h.length - sizeof(h); if (!len) { rv = (int)h.data; goto out; } reply_data = malloc(len); if (!reply_data) { rv = -ENOMEM; goto out; } rv = recv_data(fd, reply_data, len, MSG_WAITALL); if (rv != len) { free(reply_data); rv = -1; goto out; } if (lvblen < len) len = lvblen; memcpy(lvb, reply_data, len); free(reply_data); rv = (int)h.data; out: close(fd); return rv; } int sanlock_format_rindex(struct sanlk_rindex *rx, uint32_t flags) { int rv, fd; if (!rx || !rx->lockspace_name[0] || !rx->disk.path[0]) return -EINVAL; rv = connect_socket(&fd); if (rv < 0) return rv; rv = send_header(fd, SM_CMD_FORMAT_RINDEX, flags, 
sizeof(struct sanlk_rindex), 0, 0); if (rv < 0) goto out; rv = send_data(fd, rx, sizeof(struct sanlk_rindex), 0); if (rv < 0) { rv = -errno; goto out; } rv = recv_result(fd); out: close(fd); return rv; } int sanlock_rebuild_rindex(struct sanlk_rindex *rx, uint32_t flags) { int rv, fd; if (!rx || !rx->lockspace_name[0] || !rx->disk.path[0]) return -EINVAL; rv = connect_socket(&fd); if (rv < 0) return rv; rv = send_header(fd, SM_CMD_REBUILD_RINDEX, flags, sizeof(struct sanlk_rindex), 0, 0); if (rv < 0) goto out; rv = send_data(fd, rx, sizeof(struct sanlk_rindex), 0); if (rv < 0) { rv = -errno; goto out; } rv = recv_result(fd); out: close(fd); return rv; } int sanlock_update_rindex(struct sanlk_rindex *rx, uint32_t flags, struct sanlk_rentry *re) { struct sanlk_rentry re_recv; int rv, fd; memset(&re_recv, 0, sizeof(re_recv)); if (!rx || !rx->lockspace_name[0] || !rx->disk.path[0] || !re) return -EINVAL; rv = connect_socket(&fd); if (rv < 0) return rv; rv = send_header(fd, SM_CMD_UPDATE_RINDEX, flags, sizeof(struct sanlk_rindex) + sizeof(struct sanlk_rentry), 0, 0); if (rv < 0) goto out; rv = send_data(fd, rx, sizeof(struct sanlk_rindex), 0); if (rv < 0) { rv = -errno; goto out; } rv = send_data(fd, re, sizeof(struct sanlk_rentry), 0); if (rv < 0) { rv = -1; goto out; } rv = recv_result(fd); if (rv < 0) goto out; rv = recv_data(fd, &re_recv, sizeof(struct sanlk_rentry), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(struct sanlk_rentry)) { rv = -1; goto out; } memcpy(re, &re_recv, sizeof(struct sanlk_rentry)); rv = 0; out: close(fd); return rv; } int sanlock_lookup_rindex(struct sanlk_rindex *rx, uint32_t flags, struct sanlk_rentry *re) { struct sanlk_rentry re_recv; int rv, fd; memset(&re_recv, 0, sizeof(re_recv)); if (!rx || !rx->lockspace_name[0] || !rx->disk.path[0] || !re) return -EINVAL; rv = connect_socket(&fd); if (rv < 0) return rv; rv = send_header(fd, SM_CMD_LOOKUP_RINDEX, flags, sizeof(struct sanlk_rindex) + sizeof(struct 
sanlk_resource), 0, 0); if (rv < 0) goto out; rv = send_data(fd, rx, sizeof(struct sanlk_rindex), 0); if (rv < 0) { rv = -errno; goto out; } rv = send_data(fd, re, sizeof(struct sanlk_rentry), 0); if (rv < 0) { rv = -1; goto out; } rv = recv_result(fd); if (rv < 0) goto out; rv = recv_data(fd, &re_recv, sizeof(struct sanlk_rentry), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(struct sanlk_rentry)) { rv = -1; goto out; } memcpy(re, &re_recv, sizeof(struct sanlk_rentry)); rv = 0; out: close(fd); return rv; } int sanlock_create_resource(struct sanlk_rindex *rx, uint32_t flags, struct sanlk_rentry *re, int max_hosts, int num_hosts) { struct sanlk_rentry re_recv; int rv, fd; memset(&re_recv, 0, sizeof(re_recv)); if (!rx || !rx->lockspace_name[0] || !rx->disk.path[0] || !re) return -EINVAL; rv = connect_socket(&fd); if (rv < 0) return rv; rv = send_header(fd, SM_CMD_CREATE_RESOURCE, flags, sizeof(struct sanlk_rindex) + sizeof(struct sanlk_rentry), max_hosts, num_hosts); if (rv < 0) goto out; rv = send_data(fd, rx, sizeof(struct sanlk_rindex), 0); if (rv < 0) { rv = -errno; goto out; } rv = send_data(fd, re, sizeof(struct sanlk_rentry), 0); if (rv < 0) { rv = -1; goto out; } rv = recv_result(fd); if (rv < 0) goto out; rv = recv_data(fd, &re_recv, sizeof(struct sanlk_rentry), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(struct sanlk_rentry)) { rv = -1; goto out; } memcpy(re, &re_recv, sizeof(struct sanlk_rentry)); rv = 0; out: close(fd); return rv; } int sanlock_delete_resource(struct sanlk_rindex *rx, uint32_t flags, struct sanlk_rentry *re) { struct sanlk_rentry re_recv; int rv, fd; memset(&re_recv, 0, sizeof(re_recv)); if (!rx || !rx->lockspace_name[0] || !rx->disk.path[0] || !re) return -EINVAL; rv = connect_socket(&fd); if (rv < 0) return rv; rv = send_header(fd, SM_CMD_DELETE_RESOURCE, flags, sizeof(struct sanlk_rindex) + sizeof(struct sanlk_rentry), 0, 0); if (rv < 0) goto out; rv = send_data(fd, rx, sizeof(struct 
sanlk_rindex), 0); if (rv < 0) { rv = -errno; goto out; } rv = send_data(fd, re, sizeof(struct sanlk_rentry), 0); if (rv < 0) { rv = -1; goto out; } rv = recv_result(fd); if (rv < 0) goto out; rv = recv_data(fd, &re_recv, sizeof(struct sanlk_rentry), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(struct sanlk_rentry)) { rv = -1; goto out; } rv = 0; out: close(fd); return rv; } /* * src may have colons/spaces escaped (with backslash) or unescaped. * if unescaped colons/spaces are found, insert backslash before them. * * returns strlen of dst. */ size_t sanlock_path_export(char *dst, const char *src, size_t dstlen) { int i = 0; /* pos in src */ int j = 0; /* pos in dst */ memset(dst, 0, dstlen); for (i = 0; i < strlen(src); i++) { /* take an escape character plus whatever follows it. */ if (src[i] == '\\') { if (j > dstlen - 3) goto out; dst[j] = src[i]; j++; i++; dst[j] = src[i]; goto next_char; } /* add escape character before an unescaped space or colon. */ if ((src[i] == ' ') || (src[i] == ':')) { if (j > dstlen - 3) goto out; dst[j] = '\\'; j++; dst[j] = src[i]; goto next_char; } /* copy non-special char from src to dst. 
*/ if (j > dstlen - 2) goto out; dst[j] = src[i]; next_char: if (dst[j] == '\0') goto out; j++; } out: return strlen(dst); } /* src has colons/spaces escaped with backslash, dst should have backslash removed */ size_t sanlock_path_import(char *dst, const char *src, size_t dstlen) { size_t j = 0; const char *p = src; while (j < dstlen) { if (*p == '\\') goto next_loop; dst[j] = *p; if (*p == '\0') return j; j++; next_loop: p++; } return 0; } /* * convert from struct sanlk_resource to string with format: * :::[::...]: */ int sanlock_res_to_str(struct sanlk_resource *res, char **str_ret) { char path[SANLK_PATH_LEN + 1]; char *str; int ret, len, pos, d; if (!res) return -EINVAL; str = malloc(SANLK_MAX_RES_STR + 1); if (!str) return -ENOMEM; memset(str, 0, SANLK_MAX_RES_STR + 1); len = SANLK_MAX_RES_STR; pos = 0; ret = snprintf(str + pos, len - pos, "%s:%s", res->lockspace_name, res->name); if (ret >= len - pos) goto fail; pos += ret; for (d = 0; d < res->num_disks; d++) { memset(path, 0, sizeof(path)); sanlock_path_export(path, res->disks[d].path, sizeof(path)); ret = snprintf(str + pos, len - pos, ":%s:%llu", path, (unsigned long long)res->disks[d].offset); if (ret >= len - pos) goto fail; pos += ret; } if (res->flags & SANLK_RES_SHARED) ret = snprintf(str + pos, len - pos, ":SH"); else ret = snprintf(str + pos, len - pos, ":%llu", (unsigned long long)res->lver); if (ret > len - pos) goto fail; pos += ret; if (pos > len) goto fail; *str_ret = str; return 0; fail: free(str); return -EINVAL; } /* * convert to struct sanlk_resource from string with format: * :::[::...][:] * * If str contains a backslash escape character, the backslash needs to be * excluded from the string in res struct. The path string in the res struct * needs to be suitable for passing to open(2), which means it should not * include escape characters. 
*/

/*
 * Parse str into a newly allocated struct sanlk_resource (with room for the
 * disks found in str) and return it through *res_ret; the caller frees it.
 *
 * Fields are separated by unescaped ':'.  A backslash makes the following
 * character literal and is itself dropped.  The first two fields are the
 * lockspace and resource names, then path/offset pairs follow, and one
 * extra trailing field is either "SH" (sets SANLK_RES_SHARED) or a lease
 * version (sets SANLK_RES_LVER).
 *
 * Returns 0 on success, -EINVAL for NULL arguments, -ENXIO if str is
 * shorter than three characters, -2 if there are more than SANLK_MAX_DISKS
 * disks, -ENOMEM, or -1 for any other malformed input.
 */
int sanlock_str_to_res(char *str, struct sanlk_resource **res_ret)
{
	struct sanlk_resource *res;
	char sub[SANLK_PATH_LEN + 1];
	unsigned long long offset;
	int i, j, d, rv, len, sub_count, colons, num_disks, have_lver;

	if (!str || !res_ret)
		return -EINVAL;

	/* str is not modified, so its length is computed once */
	len = strlen(str);

	if (len < 3)
		return -ENXIO;

	/* count the unescaped colons to learn how many fields there are */
	colons = 0;
	for (i = 0; i < len; i++) {
		if (str[i] == '\\') {
			i++;
			continue;
		}

		if (str[i] == ':')
			colons++;
	}
	if (!colons || (colons == 2)) {
		return -1;
	}

	/* colons - 1 separators follow the two names: two per disk, plus
	   one more when a trailing lver/SH field is present */
	num_disks = (colons - 1) / 2;
	have_lver = (colons - 1) % 2;

	if (num_disks > SANLK_MAX_DISKS)
		return -2;

	res = calloc(1, sizeof(struct sanlk_resource) +
			num_disks * sizeof(struct sanlk_disk));
	if (!res)
		return -ENOMEM;

	res->num_disks = num_disks;

	d = 0;
	sub_count = 0;
	j = 0;
	memset(sub, 0, sizeof(sub));

	/* i == len visits the terminating NUL so the last field is flushed */
	for (i = 0; i < len + 1; i++) {
		if (str[i] == '\\') {
			/* a trailing backslash escapes nothing */
			if (i == (len - 1))
				goto fail;

			if (j >= SANLK_PATH_LEN)
				goto fail;

			i++;
			sub[j++] = str[i];
			continue;
		}
		if (i < len && str[i] != ':') {
			if (j >= SANLK_PATH_LEN)
				goto fail;
			sub[j++] = str[i];
			continue;
		}

		/* do something with sub when we hit ':' or end of str,
		   first and second subs are lockspace and resource names,
		   then even sub is path, odd sub is offset */

		if (sub_count < 2 && strlen(sub) > SANLK_NAME_LEN)
			goto fail;
		if (sub_count >= 2 && (strlen(sub) > SANLK_PATH_LEN-1 || strlen(sub) < 1))
			goto fail;

		if (sub_count == 0) {
			strncpy(res->lockspace_name, sub, SANLK_NAME_LEN);

		} else if (sub_count == 1) {
			strncpy(res->name, sub, SANLK_NAME_LEN);

		} else if (!(sub_count % 2)) {
			if (have_lver && (d == num_disks)) {
				/* the field after the last disk */
				if (!strncmp(sub, "SH", 2)) {
					res->flags |= SANLK_RES_SHARED;
				} else {
					res->flags |= SANLK_RES_LVER;
					res->lver = strtoull(sub, NULL, 0);
				}
			} else {
				strncpy(res->disks[d].path, sub, SANLK_PATH_LEN - 1);
			}
		} else {
			/* scan into a correctly typed temporary instead of
			   writing through a cast pointer */
			rv = sscanf(sub, "%llu", &offset);
			if (rv != 1)
				goto fail;
			res->disks[d].offset = offset;
			d++;
		}

		sub_count++;
		j = 0;
		memset(sub, 0, sizeof(sub));
	}

	*res_ret = res;
	return 0;

 fail:
	free(res);
	return -1;
}

/*
 * convert from array of struct sanlk_resource * to state
string with format: * "RESOURCE1 RESOURCE2 RESOURCE3 ..." * RESOURCE format in sanlock_res_to_str() comment */ int sanlock_args_to_state(int res_count, struct sanlk_resource *res_args[], char **res_state) { char *str, *state; int i, rv; state = malloc(res_count * (SANLK_MAX_RES_STR + 1)); if (!state) return -ENOMEM; memset(state, 0, res_count * (SANLK_MAX_RES_STR + 1)); for (i = 0; i < res_count; i++) { str = NULL; rv = sanlock_res_to_str(res_args[i], &str); if (rv < 0 || !str) { free(state); return rv; } if (strlen(str) > SANLK_MAX_RES_STR - 1) { free(str); free(state); return -EINVAL; } if (i) strcat(state, " "); strcat(state, str); free(str); } /* caller to free state */ *res_state = state; return 0; } /* * convert to array of struct sanlk_resource * from state string with format: * "RESOURCE1 RESOURCE2 RESOURCE3 ..." * RESOURCE format in sanlock_str_to_res() comment */ int sanlock_state_to_args(char *res_state, int *res_count, struct sanlk_resource ***res_args) { struct sanlk_resource **args; struct sanlk_resource *res; char str[SANLK_MAX_RES_STR + 1]; int count = 1, arg_count = 0; int escape = 0; int sep_colons = 0; int i, j, len, rv; for (i = 0; i < strlen(res_state); i++) { if (res_state[i] == '\\') { i++; continue; } if (res_state[i] == ' ') count++; } *res_count = count; args = malloc(count * sizeof(*args)); if (!args) return -ENOMEM; memset(args, 0, count * sizeof(*args)); j = 0; memset(str, 0, sizeof(str)); sep_colons = 0; len = strlen(res_state); for (i = 0; i < len + 1; i++) { if (i < len && res_state[i] == '\\') { str[j++] = res_state[i]; escape = 1; continue; } if (i < len && escape) { str[j++] = res_state[i]; escape = 0; continue; } if ((i < len) && (res_state[i] == ' ') && (sep_colons < 3)) { /* * This is a bit dubious. It's meant to detect when * a res string contains an unescaped space, and * inserts an escape char before it. An unescaped * space within a res string would otherwise be * misinterpreted as a separator between res strings. 
* If we've not yet seen three colons within a single * res string, then we should not be at the end yet. */ str[j++] = '\\'; str[j++] = res_state[i]; continue; } if (i < len && res_state[i] != ' ') { if (res_state[i] == ':') sep_colons++; str[j++] = res_state[i]; continue; } rv = sanlock_str_to_res(str, &res); if (rv < 0 || !res) goto fail_free; if (arg_count == count) goto fail_free; args[arg_count++] = res; j = 0; memset(str, 0, sizeof(str)); sep_colons = 0; } /* caller to free res_count res and args */ *res_count = arg_count; *res_args = args; return 0; fail_free: for (i = 0; i < count; i++) { if (args[i]) free(args[i]); } free(args); return rv; } /* * convert to struct sanlk_lockspace from string with format: * ::: */ int sanlock_str_to_lockspace(char *str, struct sanlk_lockspace *ls) { char *host_id = NULL; char *path = NULL; char *offset = NULL; int i; if (!str) return -EINVAL; for (i = 0; i < strlen(str); i++) { if (str[i] == '\\') { i++; continue; } if (str[i] == ':') { if (!host_id) host_id = &str[i]; else if (!path) path = &str[i]; else if (!offset) offset = &str[i]; } } if (host_id) { *host_id = '\0'; host_id++; } if (path) { *path = '\0'; path++; } if (offset) { *offset= '\0'; offset++; } strncpy(ls->name, str, SANLK_NAME_LEN); if (host_id) ls->host_id = atoll(host_id); if (path) sanlock_path_import(ls->host_id_disk.path, path, sizeof(ls->host_id_disk.path)); if (offset) ls->host_id_disk.offset = atoll(offset); return 0; } const char *sanlock_strerror(int rv) { switch (rv) { case SANLK_ERROR: return "General error"; case SANLK_AIO_TIMEOUT: return "IO timeout"; case SANLK_WD_ERROR: return "Watchdog device error"; case SANLK_DBLOCK_READ: return "Lease read error in dblock"; case SANLK_DBLOCK_WRITE: return "Lease write error in dblock"; case SANLK_DBLOCK_MBAL: return "Lease was acquired by another host in current ballot"; case SANLK_DBLOCK_LVER: return "Lease was acquired by another host in new ballot"; case SANLK_DBLOCK_CHECKSUM: return "Lease checksum 
error in dblock"; case SANLK_LEADER_READ: return "Lease read error in leader"; case SANLK_LEADER_WRITE: return "Lease write error in leader"; case SANLK_LEADER_DIFF: return "Lease read inconsistent"; case SANLK_LEADER_MAGIC: return "Lease does not exist on storage"; case SANLK_LEADER_VERSION: return "Lease format version on storage is not recognized"; case SANLK_LEADER_SECTORSIZE: return "Lease sector size is inconsistent"; case SANLK_LEADER_LOCKSPACE: return "Lease lockspace name is incorrect"; case SANLK_LEADER_RESOURCE: return "Lease resource name is incorrect"; case SANLK_LEADER_NUMHOSTS: return "Lease num_hosts is incorrect"; case SANLK_LEADER_CHECKSUM: return "Lease checksum error in leader"; case SANLK_ACQUIRE_LOCKSPACE: return "Lease lockspace is not found"; case SANLK_ACQUIRE_IDDISK: return "Lease lockspace disk cannot be opened"; case SANLK_ACQUIRE_IDLIVE: return "Lease is held by another host"; case SANLK_ACQUIRE_OWNED: return "Lease was acquired by another host in other ballot"; case SANLK_ACQUIRE_OTHER: return "Lease was acquired by another host in local commit"; case SANLK_ACQUIRE_SHRETRY: return "Lease is held by another host for shared acquire"; case SANLK_ACQUIRE_OWNED_RETRY: return "Lease is owned by another host"; case SANLK_RELEASE_LVER: return "Lease release version is incorrect"; case SANLK_RELEASE_OWNER: return "Lease release owner is incorrect"; case SANLK_RENEW_OWNER: return "Lease renew owner is incorrect"; case SANLK_RENEW_DIFF: return "Lease renew data has changed"; case SANLK_HOSTID_BUSY: return "Lease host ID is being used by another host"; case SANLK_REQUEST_MAGIC: return "Lease request block has invalid data"; case SANLK_REQUEST_VERSION: return "Lease request block has invalid version"; case SANLK_REQUEST_OLD: return "Lease request has newer lease version"; case SANLK_REQUEST_LVER: return "Lease request block has newer version"; default: return "Unknown error"; }; } 
sanlock-3.8.2/src/client_cmd.c000066400000000000000000000334741371427612200162320ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock.h" #include "sanlock_sock.h" #include "client_cmd.h" #ifndef GNUC_UNUSED #define GNUC_UNUSED __attribute__((__unused__)) #endif extern int send_command(int cmd, uint32_t data); static void print_debug(char *str, int len) { char *p; int i; p = &str[0]; for (i = 0; i < len-1; i++) { if (str[i] == ' ') { str[i] = '\0'; printf(" %s\n", p); p = &str[i+1]; } } if (p) printf(" %s\n", p); } static void status_daemon(struct sanlk_state *st, char *str, int debug) { printf("daemon %.48s\n", st->name); if (st->str_len && debug) print_debug(str, st->str_len); } static void status_client(struct sanlk_state *st, char *str, int debug) { printf("p %d ", st->data32); printf("%.48s\n", st->name); if (st->str_len && debug) print_debug(str, st->str_len); } static const char *add_rem_str(struct sanlk_state *st, char *str) { if (!st->str_len) return NULL; if (strstr(str, "list=add")) return "ADD"; if (strstr(str, "list=rem")) return "REM"; if (strstr(str, "list=orphan")) return "ORPHAN"; return NULL; } /* TODO: when path strings are exported, through status or inquire, we should export into a malloced buffer the size of the standard chars plus extra esc chars. 
*/ static void status_lockspace(struct sanlk_state *st, char *str, char *bin, int debug) { struct sanlk_lockspace *ls = (struct sanlk_lockspace *)bin; char path[SANLK_PATH_LEN + 1]; const char *add_rem; memset(path, 0, sizeof(path)); sanlock_path_export(path, ls->host_id_disk.path, sizeof(path)); printf("s %.48s:%llu:%s:%llu", ls->name, (unsigned long long)ls->host_id, path, (unsigned long long)ls->host_id_disk.offset); add_rem = add_rem_str(st, str); if (add_rem) printf(" %s\n", add_rem); else printf("\n"); if (st->str_len && debug) print_debug(str, st->str_len); } static void status_resource(struct sanlk_state *st, char *str, char *bin, int debug) { struct sanlk_resource *res = (struct sanlk_resource *)bin; struct sanlk_disk *disk; char path[SANLK_PATH_LEN + 1]; const char *add_rem; int i; printf("r %.48s:%.48s", res->lockspace_name, res->name); for (i = 0; i < res->num_disks; i++) { disk = (struct sanlk_disk *)(bin + sizeof(struct sanlk_resource) + i * sizeof(struct sanlk_disk)); memset(path, 0, sizeof(path)); sanlock_path_export(path, disk->path, sizeof(path)); printf(":%s:%llu", path, (unsigned long long)disk->offset); } if (res->flags & SANLK_RES_SHARED) printf(":SH p %u", st->data32); else printf(":%llu p %u", (unsigned long long)st->data64, st->data32); add_rem = add_rem_str(st, str); if (add_rem) printf(" %s\n", add_rem); else printf("\n"); if (st->str_len && debug) print_debug(str, st->str_len); } static void status_host(struct sanlk_state *st, char *str, int debug) { printf("%u timestamp %llu\n", st->data32, (unsigned long long)st->data64); if (st->str_len && debug) print_debug(str, st->str_len); } static void print_st(struct sanlk_state *st, char *str, char *bin, int debug) { switch (st->type) { case SANLK_STATE_DAEMON: status_daemon(st, str, debug); break; case SANLK_STATE_CLIENT: status_client(st, str, debug); break; case SANLK_STATE_LOCKSPACE: status_lockspace(st, str, bin, debug); break; case SANLK_STATE_RESOURCE: status_resource(st, str, bin, 
debug); break; } } #define MAX_SORT_ENTRIES 1024 static char *sort_bufs[MAX_SORT_ENTRIES]; static int sort_count; static int sort_done; static void print_type(int type, int debug) { struct sanlk_state *st; char *buf, *str, *bin; int i; for (i = 0; i < sort_count; i++) { buf = sort_bufs[i]; if (!buf) continue; st = (struct sanlk_state *)buf; str = buf + sizeof(struct sanlk_state); bin = buf + sizeof(struct sanlk_state) + SANLK_STATE_MAXSTR; if (!type || st->type == type) { print_st(st, str, bin, debug); free(buf); sort_bufs[i] = NULL; sort_done++; } } } static void print_p(int p, int debug) { struct sanlk_state *st; char *buf, *str, *bin; int i; for (i = 0; i < sort_count; i++) { buf = sort_bufs[i]; if (!buf) continue; st = (struct sanlk_state *)buf; str = buf + sizeof(struct sanlk_state); bin = buf + sizeof(struct sanlk_state) + SANLK_STATE_MAXSTR; if (st->type != SANLK_STATE_CLIENT) continue; if (st->data32 == p) { print_st(st, str, bin, debug); free(buf); sort_bufs[i] = NULL; sort_done++; } } } static int find_type(int type, int *sort_index) { struct sanlk_state *st; char *buf; int i; for (i = 0; i < sort_count; i++) { buf = sort_bufs[i]; if (!buf) continue; st = (struct sanlk_state *)buf; if (st->type == type) { *sort_index = i; return 0; } } return -1; } static void print_r(int p, char *s, int debug) { struct sanlk_resource *res; struct sanlk_state *st; char *buf, *str, *bin; int i; for (i = 0; i < sort_count; i++) { buf = sort_bufs[i]; if (!buf) continue; st = (struct sanlk_state *)buf; str = buf + sizeof(struct sanlk_state); bin = buf + sizeof(struct sanlk_state) + SANLK_STATE_MAXSTR; if (st->type != SANLK_STATE_RESOURCE) continue; res = (struct sanlk_resource *)bin; if ((p && st->data32 == p) || (s && !strncmp(s, res->lockspace_name, SANLK_NAME_LEN))) { print_st(st, str, bin, debug); free(buf); sort_bufs[i] = NULL; sort_done++; } } } static void print_r_by_p(int debug) { struct sanlk_state *st; char *buf, *str, *bin; int rv, i; while (1) { rv = 
find_type(SANLK_STATE_CLIENT, &i); if (rv < 0) return; buf = sort_bufs[i]; st = (struct sanlk_state *)buf; str = buf + sizeof(struct sanlk_state); bin = buf + sizeof(struct sanlk_state) + SANLK_STATE_MAXSTR; print_st(st, str, bin, debug); print_r(st->data32, NULL, debug); free(buf); sort_bufs[i] = NULL; sort_done++; } } static void print_r_by_s(int debug) { struct sanlk_state *st; char *buf, *str, *bin; int rv, i; while (1) { rv = find_type(SANLK_STATE_LOCKSPACE, &i); if (rv < 0) return; buf = sort_bufs[i]; st = (struct sanlk_state *)buf; str = buf + sizeof(struct sanlk_state); bin = buf + sizeof(struct sanlk_state) + SANLK_STATE_MAXSTR; print_st(st, str, bin, debug); print_r(0, st->name, debug); free(buf); sort_bufs[i] = NULL; sort_done++; } } static void recv_bin(int fd, struct sanlk_state *st, char *bin) { struct sanlk_resource *res; if (st->type == SANLK_STATE_LOCKSPACE) { recv(fd, bin, sizeof(struct sanlk_lockspace), MSG_WAITALL); } else if (st->type == SANLK_STATE_RESOURCE) { recv(fd, bin, sizeof(struct sanlk_resource), MSG_WAITALL); res = (struct sanlk_resource *)bin; recv(fd, bin+sizeof(struct sanlk_resource), res->num_disks * sizeof(struct sanlk_disk), MSG_WAITALL); } } int sanlock_status(int debug, char sort_arg) { struct sm_header h; struct sanlk_state state; char maxstr[SANLK_STATE_MAXSTR]; char maxbin[SANLK_STATE_MAXSTR]; struct sanlk_state *st; char *buf = NULL, *str, *bin; int fd, rv, len; int sort_p = 0, sort_s = 0; if (sort_arg == 'p') sort_p = 1; else if (sort_arg == 's') sort_s = 1; fd = send_command(SM_CMD_STATUS, 0); if (fd < 0) return fd; rv = recv(fd, &h, sizeof(h), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(h)) { rv = -1; goto out; } st = &state; str = maxstr; bin = maxbin; while (1) { if (sort_s || sort_p) { len = sizeof(struct sanlk_state) + SANLK_STATE_MAXSTR*4; buf = calloc(len, sizeof(char)); if (!buf) return -ENOMEM; st = (struct sanlk_state *)buf; str = buf + sizeof(struct sanlk_state); bin = buf + 
sizeof(struct sanlk_state) + SANLK_STATE_MAXSTR; } else { memset(&state, 0, sizeof(state)); memset(maxstr, 0, sizeof(maxstr)); memset(maxbin, 0, sizeof(maxbin)); } rv = recv(fd, st, sizeof(struct sanlk_state), MSG_WAITALL); if (!rv) break; if (rv != sizeof(struct sanlk_state)) break; if (st->str_len) { rv = recv(fd, str, st->str_len, MSG_WAITALL); if (rv != st->str_len) break; } recv_bin(fd, st, bin); if (sort_s || sort_p) { if ((sort_count == MAX_SORT_ENTRIES) || (!buf)) { printf("cannot sort over %d\n", MAX_SORT_ENTRIES); goto out; } sort_bufs[sort_count++] = buf; continue; } /* no sorting, print as received */ print_st(st, str, bin, debug); } if (sort_p) { print_type(SANLK_STATE_DAEMON, debug); print_p(-1, debug); print_type(SANLK_STATE_LOCKSPACE, debug); print_r_by_p(debug); if (sort_done < sort_count) { printf("-\n"); print_type(0, debug); } } else if (sort_s) { print_type(SANLK_STATE_DAEMON, debug); print_p(-1, debug); print_type(SANLK_STATE_CLIENT, debug); print_r_by_s(debug); if (sort_done < sort_count) { printf("-\n"); print_type(0, debug); } } rv = 0; out: close(fd); return rv; } static int lockspace_host_status(int debug, char *lockspace_name) { struct sm_header h; struct sanlk_state st; struct sanlk_lockspace lockspace; char str[SANLK_STATE_MAXSTR]; int fd, rv; if (!lockspace_name || !lockspace_name[0]) return -1; fd = send_command(SM_CMD_HOST_STATUS, 0); if (fd < 0) return fd; memset(&lockspace, 0, sizeof(lockspace)); snprintf(lockspace.name, SANLK_NAME_LEN, "%s", lockspace_name); rv = send(fd, &lockspace, sizeof(lockspace), 0); if (rv < 0) goto out; rv = recv(fd, &h, sizeof(h), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(h)) { rv = -1; goto out; } while (1) { rv = recv(fd, &st, sizeof(st), MSG_WAITALL); if (!rv) break; if (rv != sizeof(st)) break; if (st.str_len) { rv = recv(fd, str, st.str_len, MSG_WAITALL); if (rv != st.str_len) break; } switch (st.type) { case SANLK_STATE_HOST: status_host(&st, str, debug); break; } } rv = 
h.data; out: close(fd); return rv; } int sanlock_host_status(int debug, char *lockspace_name) { struct sm_header h; struct sanlk_state state; char maxstr[SANLK_STATE_MAXSTR]; char maxbin[SANLK_STATE_MAXSTR]; struct sanlk_state *st; char *str, *bin; struct sanlk_lockspace *ls; int fd, rv, i; if (lockspace_name && lockspace_name[0]) return lockspace_host_status(debug, lockspace_name); fd = send_command(SM_CMD_STATUS, SANLK_STATE_LOCKSPACE); if (fd < 0) return fd; rv = recv(fd, &h, sizeof(h), MSG_WAITALL); if (rv < 0) { rv = -errno; close(fd); return rv; } if (rv != sizeof(h)) { close(fd); return -1; } st = &state; str = maxstr; bin = maxbin; while (1) { memset(&state, 0, sizeof(state)); memset(maxstr, 0, sizeof(maxstr)); memset(maxbin, 0, sizeof(maxbin)); rv = recv(fd, st, sizeof(struct sanlk_state), MSG_WAITALL); if (!rv) break; if (rv != sizeof(struct sanlk_state)) break; if (st->str_len) { rv = recv(fd, str, st->str_len, MSG_WAITALL); if (rv != st->str_len) break; } recv_bin(fd, st, bin); if (st->type != SANLK_STATE_LOCKSPACE) continue; ls = (struct sanlk_lockspace *)bin; sort_bufs[sort_count++] = strdup(ls->name); } close(fd); for (i = 0; i < sort_count; i++) { printf("lockspace %s\n", sort_bufs[i]); lockspace_host_status(debug, sort_bufs[i]); free(sort_bufs[i]); } return 0; } int sanlock_renewal(char *lockspace_name) { struct sm_header h; struct sanlk_state st; struct sanlk_lockspace lockspace; char str[SANLK_STATE_MAXSTR]; int fd, rv; if (!lockspace_name || !lockspace_name[0]) return -1; fd = send_command(SM_CMD_RENEWAL, 0); if (fd < 0) return fd; memset(&lockspace, 0, sizeof(lockspace)); snprintf(lockspace.name, SANLK_NAME_LEN, "%s", lockspace_name); rv = send(fd, &lockspace, sizeof(lockspace), 0); if (rv < 0) goto out; rv = recv(fd, &h, sizeof(h), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(h)) { rv = -1; goto out; } while (1) { rv = recv(fd, &st, sizeof(st), MSG_WAITALL); if (!rv) break; if (rv != sizeof(st)) break; if (st.str_len) 
{ rv = recv(fd, str, st.str_len, MSG_WAITALL); if (rv != st.str_len) break; } printf("%s\n", str); } rv = h.data; out: close(fd); return rv; } int sanlock_log_dump(int max_size) { struct sm_header h; char *buf; int fd, rv; buf = malloc(max_size); if (!buf) return -ENOMEM; memset(buf, 0, max_size); fd = send_command(SM_CMD_LOG_DUMP, 0); if (fd < 0) { free(buf); return fd; } memset(&h, 0, sizeof(h)); rv = recv(fd, &h, sizeof(h), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(h)) { rv = -1; goto out; } if (h.data <= 0 || h.data > max_size) goto out; rv = recv(fd, buf, h.data, MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (!rv) { rv = -1; goto out; } printf("%s", buf); printf("\n"); if (rv != h.data) printf("partial dump %d of %d\n", rv, h.data); out: close(fd); free(buf); return rv; } int sanlock_shutdown(uint32_t force, int wait_result) { struct sm_header h; int cmd; int fd; int rv = 0; if (wait_result) cmd = SM_CMD_SHUTDOWN_WAIT; else cmd = SM_CMD_SHUTDOWN; fd = send_command(cmd, force); if (fd < 0) return fd; if (cmd != SM_CMD_SHUTDOWN_WAIT) goto out; memset(&h, 0, sizeof(h)); rv = recv(fd, &h, sizeof(h), MSG_WAITALL); if (rv < 0) { rv = -errno; goto out; } if (rv != sizeof(h)) { rv = -1; goto out; } rv = h.data; out: close(fd); return rv; } sanlock-3.8.2/src/client_cmd.h000066400000000000000000000010711371427612200162230ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #ifndef __CLIENT_CMD_H__ #define __CLIENT_CMD_H__ int sanlock_status(int debug, char sort_arg); int sanlock_host_status(int debug, char *lockspace_name); int sanlock_renewal(char *lockspace_name); int sanlock_log_dump(int max_size); int sanlock_shutdown(uint32_t force, int wait_result); #endif sanlock-3.8.2/src/cmd.c000066400000000000000000002317231371427612200146710ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "sanlock_admin.h" #include "sanlock_sock.h" #include "diskio.h" #include "log.h" #include "paxos_lease.h" #include "delta_lease.h" #include "lockspace.h" #include "resource.h" #include "direct.h" #include "task.h" #include "cmd.h" #include "rindex.h" /* from main.c */ void client_resume(int ci); void client_free(int ci); void client_recv_all(int ci, struct sm_header *h_recv, int pos); void client_pid_dead(int ci); void send_result(int ci, int fd, struct sm_header *h_recv, int result); static uint32_t token_id_counter = 1; static void release_cl_tokens(struct task *task, struct client *cl) { struct token *token; int j; for (j = 0; j < cl->tokens_slots; j++) { token = cl->tokens[j]; if (!token) continue; release_token(task, token, NULL); free(token); } } static void release_new_tokens(struct task *task, struct token *new_tokens[], int alloc_count, int acquire_count) { int i; for (i = 0; i < acquire_count; i++) release_token(task, new_tokens[i], NULL); for (i = 0; i < alloc_count; i++) free(new_tokens[i]); } /* called with both 
spaces_mutex and cl->mutex held */ static int check_new_tokens_space(struct client *cl, struct token *new_tokens[], int new_tokens_count) { struct space_info spi; struct token *token; int i, rv, empty_slots = 0; for (i = 0; i < cl->tokens_slots; i++) { if (!cl->tokens[i]) empty_slots++; } if (empty_slots < new_tokens_count) { /* shouldn't ever happen */ log_error("check_new_tokens_space slots %d empty %d new_tokens %d", cl->tokens_slots, empty_slots, new_tokens_count); return -ENOENT; } /* space may have failed while new tokens were being acquired */ for (i = 0; i < new_tokens_count; i++) { token = new_tokens[i]; rv = _lockspace_info(token->r.lockspace_name, &spi); if (!rv && !spi.killing_pids && spi.host_id == token->host_id) continue; return -ENOSPC; } return 0; } static const char *acquire_error_str(int error) { switch (error) { case SANLK_ACQUIRE_IDLIVE: case SANLK_ACQUIRE_OWNED: case SANLK_ACQUIRE_OTHER: case SANLK_ACQUIRE_OWNED_RETRY: return "lease owned by other host"; case SANLK_ACQUIRE_SHRETRY: return "shared lease contention"; case SANLK_DBLOCK_READ: case SANLK_DBLOCK_WRITE: case SANLK_LEADER_READ: case SANLK_LEADER_WRITE: return "lease io error"; case SANLK_LEADER_DIFF: case SANLK_LEADER_VERSION: case SANLK_LEADER_SECTORSIZE: case SANLK_LEADER_LOCKSPACE: case SANLK_LEADER_RESOURCE: case SANLK_LEADER_NUMHOSTS: case SANLK_LEADER_CHECKSUM: return "lease data invalid"; case SANLK_LEADER_MAGIC: return "lease not found"; default: return ""; }; } static void cmd_acquire(struct task *task, struct cmd_args *ca, uint32_t cmd) { struct client *cl; struct token *token = NULL; struct token *new_tokens[SANLK_MAX_RESOURCES]; struct token **grow_tokens; struct sanlk_resource res; struct sanlk_options opt; struct space_info spi; char killpath[SANLK_HELPER_PATH_LEN]; char killargs[SANLK_HELPER_ARGS_LEN]; char *opt_str; int token_len, disks_len; int fd, rv, i, j, empty_slots, lvl; int alloc_count = 0, acquire_count = 0; int pos = 0, pid_dead = 0; int new_tokens_count; int 
recv_done = 0; int result = 0; int grow_size; int cl_ci = ca->ci_target; int cl_fd = ca->cl_fd; int cl_pid = ca->cl_pid; cl = &client[cl_ci]; fd = client[ca->ci_in].fd; new_tokens_count = ca->header.data; log_cmd(cmd, "cmd_acquire %d,%d,%d ci_in %d fd %d count %d flags %x", cl_ci, cl_fd, cl_pid, ca->ci_in, fd, new_tokens_count, ca->header.cmd_flags); if (new_tokens_count > SANLK_MAX_RESOURCES) { log_error("cmd_acquire %d,%d,%d new %d max %d", cl_ci, cl_fd, cl_pid, new_tokens_count, SANLK_MAX_RESOURCES); result = -E2BIG; goto done; } pthread_mutex_lock(&cl->mutex); if (cl->pid_dead) { result = -ESTALE; pthread_mutex_unlock(&cl->mutex); goto done; } empty_slots = 0; for (i = 0; i < cl->tokens_slots; i++) { if (!cl->tokens[i]) empty_slots++; } if (empty_slots < new_tokens_count) { log_debug("cmd_acquire grow tokens slots %d empty %d new %d", cl->tokens_slots, empty_slots, new_tokens_count); grow_size = (cl->tokens_slots + (SANLK_MAX_RESOURCES * 2)) * sizeof(struct token *); grow_tokens = malloc(grow_size); if (!grow_tokens) { log_error("cmd_acquire ENOMEM grow tokens slots %d empty %d new %d grow_size %d", cl->tokens_slots, empty_slots, new_tokens_count, grow_size); result = -ENOMEM; pthread_mutex_unlock(&cl->mutex); goto done; } else { memset(grow_tokens, 0, grow_size); memcpy(grow_tokens, cl->tokens, cl->tokens_slots * sizeof(struct token *)); free(cl->tokens); cl->tokens = grow_tokens; cl->tokens_slots += (SANLK_MAX_RESOURCES * 2); empty_slots += (SANLK_MAX_RESOURCES * 2); } } memcpy(killpath, cl->killpath, SANLK_HELPER_PATH_LEN); memcpy(killargs, cl->killargs, SANLK_HELPER_ARGS_LEN); pthread_mutex_unlock(&cl->mutex); if (empty_slots < new_tokens_count) { log_error("cmd_acquire %d,%d,%d new %d slots %d", cl_ci, cl_fd, cl_pid, new_tokens_count, empty_slots); result = -ENOENT; goto done; } /* * read resource input and allocate tokens for each */ for (i = 0; i < new_tokens_count; i++) { /* * receive sanlk_resource, create token for it */ rv = recv(fd, &res, 
sizeof(struct sanlk_resource), MSG_WAITALL); if (rv > 0) pos += rv; if (rv != sizeof(struct sanlk_resource)) { log_error("cmd_acquire %d,%d,%d recv res %d %d", cl_ci, cl_fd, cl_pid, rv, errno); result = -ENOTCONN; goto done; } if (!res.num_disks || res.num_disks > SANLK_MAX_DISKS) { result = -ERANGE; goto done; } disks_len = res.num_disks * sizeof(struct sync_disk); token_len = sizeof(struct token) + disks_len; token = malloc(token_len); if (!token) { result = -ENOMEM; goto done; } memset(token, 0, token_len); token->disks = (struct sync_disk *)&token->r.disks[0]; /* shorthand */ token->r.num_disks = res.num_disks; memcpy(token->r.lockspace_name, res.lockspace_name, SANLK_NAME_LEN); memcpy(token->r.name, res.name, SANLK_NAME_LEN); if (res.flags & SANLK_RES_SHARED) token->r.flags |= SANLK_RES_SHARED; token->acquire_lver = res.lver; token->acquire_data64 = res.data64; token->acquire_data32 = res.data32; token->acquire_flags = res.flags; /* * receive sanlk_disk's / sync_disk's * * WARNING: as a shortcut, this requires that sync_disk and * sanlk_disk match; this is the reason for the pad fields * in sanlk_disk (TODO: let these differ?) 
*/ rv = recv(fd, token->disks, disks_len, MSG_WAITALL); if (rv > 0) pos += rv; if (rv != disks_len) { log_error("cmd_acquire %d,%d,%d recv disks %d %d", cl_ci, cl_fd, cl_pid, rv, errno); free(token); result = -ENOTCONN; goto done; } /* zero out pad1 and pad2, see WARNING above */ for (j = 0; j < token->r.num_disks; j++) { token->disks[j].sector_size = 0; token->disks[j].fd = -1; } token->token_id = token_id_counter++; new_tokens[i] = token; alloc_count++; } rv = recv(fd, &opt, sizeof(struct sanlk_options), MSG_WAITALL); if (rv > 0) pos += rv; if (rv != sizeof(struct sanlk_options)) { log_error("cmd_acquire %d,%d,%d recv opt %d %d", cl_ci, cl_fd, cl_pid, rv, errno); result = -ENOTCONN; goto done; } strncpy(cl->owner_name, opt.owner_name, SANLK_NAME_LEN); if (opt.len) { opt_str = malloc(opt.len); if (!opt_str) { result = -ENOMEM; goto done; } rv = recv(fd, opt_str, opt.len, MSG_WAITALL); if (rv > 0) pos += rv; if (rv != opt.len) { log_error("cmd_acquire %d,%d,%d recv str %d %d", cl_ci, cl_fd, cl_pid, rv, errno); free(opt_str); result = -ENOTCONN; goto done; } } /* TODO: warn if header.length != sizeof(header) + pos ? 
*/ recv_done = 1; /* * all command input has been received, start doing the acquire */ for (i = 0; i < new_tokens_count; i++) { token = new_tokens[i]; rv = lockspace_info(token->r.lockspace_name, &spi); if (rv < 0 || spi.killing_pids) { log_errot(token, "cmd_acquire %d,%d,%d invalid lockspace " "found %d failed %d name %.48s", cl_ci, cl_fd, cl_pid, rv, spi.killing_pids, token->r.lockspace_name); result = -ENOSPC; goto done; } token->host_id = spi.host_id; token->host_generation = spi.host_generation; token->space_id = spi.space_id; token->pid = cl_pid; token->io_timeout = spi.io_timeout; token->sector_size = spi.sector_size; /* starting hint, may be changed */ token->align_size = spi.align_size; /* starting hint, may be changed */ if (cl->restricted & SANLK_RESTRICT_SIGKILL) token->flags |= T_RESTRICT_SIGKILL; if (cl->restricted & SANLK_RESTRICT_SIGTERM) token->flags |= T_RESTRICT_SIGTERM; } for (i = 0; i < new_tokens_count; i++) { token = new_tokens[i]; rv = acquire_token(task, token, ca->header.cmd_flags, killpath, killargs); if (rv < 0) { switch (rv) { case -EEXIST: case -EAGAIN: case -EBUSY: lvl = LOG_DEBUG; break; case SANLK_ACQUIRE_IDLIVE: case SANLK_ACQUIRE_OWNED: case SANLK_ACQUIRE_OTHER: case SANLK_ACQUIRE_OWNED_RETRY: lvl = com.quiet_fail ? LOG_DEBUG : LOG_ERR; break; default: lvl = LOG_ERR; } if (token->res_id) log_level(token->space_id, token->res_id, NULL, lvl, "cmd_acquire %d,%d,%d acquire_token %d %s", cl_ci, cl_fd, cl_pid, rv, acquire_error_str(rv)); else log_level(token->space_id, 0, NULL, lvl, "cmd_acquire %d,%d,%d acquire_token %s %d %s", cl_ci, cl_fd, cl_pid, token->r.name, rv, acquire_error_str(rv)); result = rv; goto done; } acquire_count++; } /* * Success acquiring the leases: * lock mutex, * 1. if pid is live, move new_tokens to cl->tokens, clear cmd_active, unlock mutex * 2. if pid is dead, clear cmd_active, unlock mutex, release new_tokens, release cl->tokens, client_free * * Failure acquiring the leases: * lock mutex, * 3. 
if pid is live, clear cmd_active, unlock mutex, release new_tokens * 4. if pid is dead, clear cmd_active, unlock mutex, release new_tokens, release cl->tokens, client_free * * client_pid_dead() won't touch cl->tokens while cmd_active is set. * As soon as we clear cmd_active and unlock the mutex, client_pid_dead * will attempt to clear cl->tokens itself. If we find client_pid_dead * has already happened when we look at pid_dead, then we know that it * won't be called again, and it's our responsibility to clear cl->tokens * and call client_free. */ /* * We hold both space_mutex and cl->mutex at once to create the crucial * linkage between the client pid and the lockspace. Once we release * these two mutexes, if the lockspace fails, this pid will be killed. * Prior to inserting the new_tokens into the client, if the lockspace * fails, kill_pids/client_using_pid would not find this pid (assuming * it doesn't already hold other tokens using the lockspace). If * the lockspace failed while we were acquring the tokens, kill_pids * has already run and not found us, so we must revert what we've done * in acquire. * * Warning: * We could deadlock if we hold cl->mutex and take spaces_mutex, * because all_pids_dead() and kill_pids() hold spaces_mutex and take * cl->mutex. So, lock spaces_mutex first, then cl->mutex to avoid the * deadlock. * * Other approaches: * A solution may be to record in each sp all the pids/cis using it * prior to starting the acquire. Then we would not need to do this * check here to see if the lockspace has been killed (if it was, the * pid for this ci would have been killed in kill_pids), and * all_pids_dead() and kill_pids() would not need to go through each cl * and each cl->token to check if it's using the sp (it would know by * just looking at sp->pids[] and killing each). 
*/ done: pthread_mutex_lock(&spaces_mutex); pthread_mutex_lock(&cl->mutex); log_cmd(cmd, "cmd_acquire %d,%d,%d result %d pid_dead %d", cl_ci, cl_fd, cl_pid, result, cl->pid_dead); pid_dead = cl->pid_dead; cl->cmd_active = 0; if (!result && !pid_dead) { if (check_new_tokens_space(cl, new_tokens, new_tokens_count)) { /* case 1 becomes case 3 */ log_error("cmd_acquire %d,%d,%d invalid lockspace", cl_ci, cl_fd, cl_pid); result = -ENOSPC; } } /* 1. Success acquiring leases, and pid is live */ if (!result && !pid_dead) { for (i = 0; i < new_tokens_count; i++) { for (j = 0; j < cl->tokens_slots; j++) { if (!cl->tokens[j]) { cl->tokens[j] = new_tokens[i]; break; } } } /* goto reply after mutex unlock */ } pthread_mutex_unlock(&cl->mutex); pthread_mutex_unlock(&spaces_mutex); /* 1. Success acquiring leases, and pid is live */ if (!result && !pid_dead) { /* work done before mutex unlock */ goto reply; } /* 2. Success acquiring leases, and pid is dead */ if (!result && pid_dead) { release_new_tokens(task, new_tokens, alloc_count, acquire_count); release_cl_tokens(task, cl); client_free(cl_ci); result = -ENOTTY; goto reply; } /* 3. Failure acquiring leases, and pid is live */ if (result && !pid_dead) { release_new_tokens(task, new_tokens, alloc_count, acquire_count); goto reply; } /* 4. 
Failure acquiring leases, and pid is dead */ if (result && pid_dead) { release_new_tokens(task, new_tokens, alloc_count, acquire_count); release_cl_tokens(task, cl); client_free(cl_ci); goto reply; } reply: if (!recv_done) client_recv_all(ca->ci_in, &ca->header, pos); send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); } static void cmd_release(struct task *task, struct cmd_args *ca, uint32_t cmd) { struct client *cl; struct token *token; struct token *rem_tokens[SANLK_MAX_RESOURCES]; struct sanlk_resource res; struct sanlk_resource new; struct sanlk_resource *resrename = NULL; int fd, rv, i, j, found, pid_dead; int rem_tokens_count = 0; int result = 0; int cl_ci = ca->ci_target; int cl_fd = ca->cl_fd; int cl_pid = ca->cl_pid; cl = &client[cl_ci]; fd = client[ca->ci_in].fd; log_cmd(cmd, "cmd_release %d,%d,%d ci_in %d fd %d count %d flags %x", cl_ci, cl_fd, cl_pid, ca->ci_in, fd, ca->header.data, ca->header.cmd_flags); /* caller wants to release all resources */ if (ca->header.cmd_flags & SANLK_REL_ALL) { pthread_mutex_lock(&cl->mutex); for (j = 0; j < cl->tokens_slots; j++) { token = cl->tokens[j]; if (!token) continue; rem_tokens[rem_tokens_count++] = token; cl->tokens[j] = NULL; } pthread_mutex_unlock(&cl->mutex); goto do_remove; } if (ca->header.cmd_flags & SANLK_REL_ORPHAN) { rv = recv(fd, &res, sizeof(struct sanlk_resource), MSG_WAITALL); if (rv != sizeof(struct sanlk_resource)) { log_error("cmd_release %d,%d,%d recv res %d %d", cl_ci, cl_fd, cl_pid, rv, errno); result = -ENOTCONN; goto do_remove; } result = release_orphan(&res); goto out; } if (ca->header.cmd_flags & SANLK_REL_RENAME) { rv = recv(fd, &res, sizeof(struct sanlk_resource), MSG_WAITALL); if (rv != sizeof(struct sanlk_resource)) { log_error("cmd_release %d,%d,%d recv res %d %d", cl_ci, cl_fd, cl_pid, rv, errno); result = -ENOTCONN; goto do_remove; } /* second res struct has new name for first res */ rv = recv(fd, &new, sizeof(struct sanlk_resource), MSG_WAITALL); if (rv != 
sizeof(struct sanlk_resource)) { log_error("cmd_release %d,%d,%d recv new %d %d", cl_ci, cl_fd, cl_pid, rv, errno); result = -ENOTCONN; goto do_remove; } found = 0; pthread_mutex_lock(&cl->mutex); for (j = 0; j < cl->tokens_slots; j++) { token = cl->tokens[j]; if (!token) continue; if (memcmp(token->r.lockspace_name, res.lockspace_name, NAME_ID_SIZE)) continue; if (memcmp(token->r.name, res.name, NAME_ID_SIZE)) continue; rem_tokens[rem_tokens_count++] = token; cl->tokens[j] = NULL; found = 1; break; } pthread_mutex_unlock(&cl->mutex); if (!found) { log_error("cmd_release %d,%d,%d no resource %.48s", cl_ci, cl_fd, cl_pid, res.name); result = -1; } resrename = &new; goto do_remove; } /* caller is specifying specific resources to release */ for (i = 0; i < ca->header.data; i++) { rv = recv(fd, &res, sizeof(struct sanlk_resource), MSG_WAITALL); if (rv != sizeof(struct sanlk_resource)) { log_error("cmd_release %d,%d,%d recv res %d %d", cl_ci, cl_fd, cl_pid, rv, errno); result = -ENOTCONN; break; } found = 0; pthread_mutex_lock(&cl->mutex); for (j = 0; j < cl->tokens_slots; j++) { token = cl->tokens[j]; if (!token) continue; if (memcmp(token->r.lockspace_name, res.lockspace_name, NAME_ID_SIZE)) continue; if (memcmp(token->r.name, res.name, NAME_ID_SIZE)) continue; rem_tokens[rem_tokens_count++] = token; cl->tokens[j] = NULL; found = 1; break; } pthread_mutex_unlock(&cl->mutex); if (!found) { log_error("cmd_release %d,%d,%d no resource %.48s", cl_ci, cl_fd, cl_pid, res.name); result = -1; } } do_remove: for (i = 0; i < rem_tokens_count; i++) { token = rem_tokens[i]; rv = release_token(task, token, resrename); if (rv < 0) result = rv; free(token); } out: pthread_mutex_lock(&cl->mutex); log_cmd(cmd, "cmd_release %d,%d,%d result %d pid_dead %d count %d", cl_ci, cl_fd, cl_pid, result, cl->pid_dead, rem_tokens_count); pid_dead = cl->pid_dead; cl->cmd_active = 0; if (!pid_dead && cl->kill_count) { /* * If no tokens are left, clear all cl killing state. 
The * cl no longer needs to be killed, and the pid may continue * running, even if a failed lockspace it was using is * released. When the lockspace is re-added, the tokens * may be re-acquired for this same cl/pid. */ found = 0; for (j = 0; j < cl->tokens_slots; j++) { if (!cl->tokens[j]) continue; found = 1; break; } if (!found) { cl->kill_count = 0; cl->kill_last = 0; cl->flags &= ~CL_RUNPATH_SENT; log_cmd(cmd, "cmd_release %d,%d,%d clear kill state", cl_ci, cl_fd, cl_pid); } } pthread_mutex_unlock(&cl->mutex); if (pid_dead) { /* release any tokens not already released above */ release_cl_tokens(task, cl); client_free(cl_ci); } send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); } static void cmd_inquire(struct task *task, struct cmd_args *ca, uint32_t cmd) { struct sm_header h; struct token *token; struct client *cl; char *state = NULL, *str; int state_maxlen = 0, state_strlen = 0; int res_count = 0, cat_count = 0; int fd, i, rv, pid_dead; int result = 0; int cl_ci = ca->ci_target; int cl_fd = ca->cl_fd; int cl_pid = ca->cl_pid; cl = &client[cl_ci]; fd = client[ca->ci_in].fd; log_cmd(cmd, "cmd_inquire %d,%d,%d ci_in %d fd %d", cl_ci, cl_fd, cl_pid, ca->ci_in, fd); pthread_mutex_lock(&cl->mutex); if (cl->pid_dead) { result = -ESTALE; goto done; } for (i = 0; i < cl->tokens_slots; i++) { if (cl->tokens[i]) res_count++; } if (!res_count) { result = 0; goto done; } state_maxlen = res_count * (SANLK_MAX_RES_STR + 1); state = malloc(state_maxlen); if (!state) { result = -ENOMEM; goto done; } memset(state, 0, state_maxlen); /* should match sanlock_args_to_state() */ for (i = 0; i < cl->tokens_slots; i++) { token = cl->tokens[i]; if (!token) continue; /* check number of tokens hasn't changed since first count */ if (cat_count >= res_count) { log_error("cmd_inquire %d,%d,%d count changed %d %d", cl_ci, cl_fd, cl_pid, res_count, cat_count); result = -ENOENT; goto done; } str = NULL; rv = sanlock_res_to_str(&token->r, &str); if (rv < 0 || !str) { 
log_errot(token, "cmd_inquire %d,%d,%d res_to_str %d", cl_ci, cl_fd, cl_pid, rv); result = -ELIBACC; goto done; } if (strlen(str) > SANLK_MAX_RES_STR - 1) { log_errot(token, "cmd_inquire %d,%d,%d strlen %zu", cl_ci, cl_fd, cl_pid, strlen(str)); free(str); result = -ELIBBAD; goto done; } /* space is str separator, so it's invalid within each str */ if (strstr(str, " ")) { log_errot(token, "cmd_inquire %d,%d,%d str space", cl_ci, cl_fd, cl_pid); free(str); result = -ELIBSCN; goto done; } if (cat_count) strcat(state, " "); strcat(state, str); cat_count++; free(str); } state[state_maxlen - 1] = '\0'; state_strlen = strlen(state); result = 0; done: pid_dead = cl->pid_dead; cl->cmd_active = 0; pthread_mutex_unlock(&cl->mutex); log_cmd(cmd, "cmd_inquire %d,%d,%d result %d pid_dead %d res_count %d cat_count %d strlen %d", cl_ci, cl_fd, cl_pid, result, pid_dead, res_count, cat_count, state_strlen); if (pid_dead) { release_cl_tokens(task, cl); client_free(cl_ci); } memcpy(&h, &ca->header, sizeof(struct sm_header)); h.version = SM_PROTO; h.data = result; h.data2 = res_count; if (state) { h.length = sizeof(h) + state_strlen + 1; send(fd, &h, sizeof(h), MSG_NOSIGNAL); send(fd, state, state_strlen + 1, MSG_NOSIGNAL); free(state); } else { h.length = sizeof(h); send(fd, &h, sizeof(h), MSG_NOSIGNAL); } client_resume(ca->ci_in); } /* * The behavior may be a little iffy in the case where a pid is killed (due to * lockspace failure) while it is doing convert. If the pid responds by * exiting, then this cmd_convert will see pid_dead and release all tokens at * the end. If the pid wants to respond by explicitly releasing its leases, * then this convert should fail and return for the same reason the lockspace * failed. Once the convert returns, the pid can respond to the killpath by * releasing all the leases. 
* * This sets cmd_active, along with acquire/release/inquire, which means * that it is serialized along with all cmds that set cmd_active, and * cl->tokens will not change while the cmd is active. This also means * it has to handle pid_dead at the end in case the pid exited while the * cmd was active and cl->tokens need to be released. * (killpath also sets cmd_active so that tokens are not acquired * while it's being set.) */ static void cmd_convert(struct task *task, struct cmd_args *ca, uint32_t cmd) { struct sanlk_resource res; struct token *token; struct client *cl; int cl_ci = ca->ci_target; int cl_fd = ca->cl_fd; int cl_pid = ca->cl_pid; int pid_dead = 0; int result = 0; int found = 0; int fd, i, rv; cl = &client[cl_ci]; fd = client[ca->ci_in].fd; log_cmd(cmd, "cmd_convert %d,%d,%d ci_in %d fd %d", cl_ci, cl_fd, cl_pid, ca->ci_in, fd); rv = recv(fd, &res, sizeof(struct sanlk_resource), MSG_WAITALL); if (rv != sizeof(struct sanlk_resource)) { result = -ENOTCONN; goto reply; } pthread_mutex_lock(&cl->mutex); for (i = 0; i < cl->tokens_slots; i++) { token = cl->tokens[i]; if (!token) continue; if (memcmp(token->r.lockspace_name, res.lockspace_name, NAME_ID_SIZE)) continue; if (memcmp(token->r.name, res.name, NAME_ID_SIZE)) continue; found = 1; break; } pthread_mutex_unlock(&cl->mutex); if (!found) { result = -ENOENT; goto cmd_done; } rv = convert_token(task, &res, token, ca->header.cmd_flags); if (rv < 0) result = rv; cmd_done: pthread_mutex_lock(&cl->mutex); pid_dead = cl->pid_dead; cl->cmd_active = 0; pthread_mutex_unlock(&cl->mutex); reply: log_cmd(cmd, "cmd_convert %d,%d,%d result %d pid_dead %d", cl_ci, cl_fd, cl_pid, result, pid_dead); if (pid_dead) { release_cl_tokens(task, cl); client_free(cl_ci); } send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); } static void cmd_request(struct task *task, struct cmd_args *ca, uint32_t cmd) { struct token *token; struct sanlk_resource res; struct space_info spi; uint64_t owner_id = 0; 
uint32_t force_mode; int token_len, disks_len; int j, fd, rv, error, result; fd = client[ca->ci_in].fd; force_mode = ca->header.data; /* receiving and setting up token copied from cmd_acquire */ rv = recv(fd, &res, sizeof(struct sanlk_resource), MSG_WAITALL); if (rv != sizeof(struct sanlk_resource)) { log_error("cmd_request %d,%d recv %d %d", ca->ci_in, fd, rv, errno); result = -ENOTCONN; goto reply; } if (!res.num_disks || res.num_disks > SANLK_MAX_DISKS) { result = -ERANGE; goto reply; } disks_len = res.num_disks * sizeof(struct sync_disk); token_len = sizeof(struct token) + disks_len; token = malloc(token_len); if (!token) { result = -ENOMEM; goto reply; } memset(token, 0, token_len); token->disks = (struct sync_disk *)&token->r.disks[0]; /* shorthand */ token->r.num_disks = res.num_disks; memcpy(token->r.lockspace_name, res.lockspace_name, SANLK_NAME_LEN); memcpy(token->r.name, res.name, SANLK_NAME_LEN); token->acquire_lver = res.lver; token->acquire_data64 = res.data64; token->acquire_data32 = res.data32; token->acquire_flags = res.flags; /* * receive sanlk_disk's / sync_disk's * * WARNING: as a shortcut, this requires that sync_disk and * sanlk_disk match; this is the reason for the pad fields * in sanlk_disk (TODO: let these differ?) 
*/ rv = recv(fd, token->disks, disks_len, MSG_WAITALL); if (rv != disks_len) { result = -ENOTCONN; goto reply_free; } /* zero out pad1 and pad2, see WARNING above */ for (j = 0; j < token->r.num_disks; j++) { token->disks[j].sector_size = 0; token->disks[j].fd = -1; } log_cmd(cmd, "cmd_request %d,%d force_mode %u %.48s:%.48s:%.256s:%llu", ca->ci_in, fd, force_mode, token->r.lockspace_name, token->r.name, token->disks[0].path, (unsigned long long)token->r.disks[0].offset); rv = lockspace_info(token->r.lockspace_name, &spi); if (rv < 0 || spi.killing_pids) { result = -ENOSPC; goto reply_free; } token->io_timeout = spi.io_timeout; token->sector_size = spi.sector_size; token->align_size = spi.align_size; error = request_token(task, token, force_mode, &owner_id, (ca->header.cmd_flags & SANLK_REQUEST_NEXT_LVER)); if (error < 0) { result = error; goto reply_free; } result = 0; if (!token->acquire_lver && !force_mode) goto reply_free; if (owner_id) host_status_set_bit(token->r.lockspace_name, owner_id); reply_free: free(token); reply: log_cmd(cmd, "cmd_request %d,%d done %d", ca->ci_in, fd, result); send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); } static void cmd_examine(struct task *task GNUC_UNUSED, struct cmd_args *ca, uint32_t cmd) { union { struct sanlk_resource r; struct sanlk_lockspace s; } buf; struct sanlk_resource *res = NULL; struct sanlk_lockspace *ls = NULL; char *space_name = NULL; char *res_name = NULL; int fd, rv, result, count = 0, datalen; fd = client[ca->ci_in].fd; if (ca->header.cmd == SM_CMD_EXAMINE_RESOURCE) { datalen = sizeof(struct sanlk_resource); res = &buf.r; } else { datalen = sizeof(struct sanlk_lockspace); ls = &buf.s; } rv = recv(fd, &buf, datalen, MSG_WAITALL); if (rv != datalen) { log_error("cmd_examine %d,%d recv %d %d", ca->ci_in, fd, rv, errno); result = -ENOTCONN; goto reply; } if (res) { space_name = res->lockspace_name; res_name = res->name; } else { space_name = ls->name; } log_cmd(cmd, "cmd_examine %d,%d 
%.48s %.48s", ca->ci_in, fd, space_name, res_name ? res_name : ""); count = set_resource_examine(space_name, res_name); result = 0; reply: log_cmd(cmd, "cmd_examine %d,%d done %d", ca->ci_in, fd, count); send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); } static void cmd_set_lvb(struct task *task GNUC_UNUSED, struct cmd_args *ca, uint32_t cmd) { struct sanlk_resource res; char *lvb = NULL; int lvblen, rv, fd, result; fd = client[ca->ci_in].fd; rv = recv(fd, &res, sizeof(struct sanlk_resource), MSG_WAITALL); if (rv != sizeof(struct sanlk_resource)) { log_error("cmd_set_lvb %d,%d recv %d %d", ca->ci_in, fd, rv, errno); result = -ENOTCONN; goto reply; } lvblen = ca->header.length - sizeof(struct sm_header) - sizeof(struct sanlk_resource); /* 4096 is the max sector size we handle, it is compared against the actual 512/4K sector size in res_set_lvb. */ if (lvblen > 4096) { log_error("cmd_set_lvb %d,%d lvblen %d too big", ca->ci_in, fd, lvblen); result = -E2BIG; goto reply; } lvb = malloc(lvblen); if (!lvb) { result = -ENOMEM; goto reply; } rv = recv(fd, lvb, lvblen, MSG_WAITALL); if (rv != lvblen) { log_error("cmd_set_lvb %d,%d recv lvblen %d lvb %d %d", ca->ci_in, fd, lvblen, rv, errno); result = -ENOTCONN; goto reply; } result = res_set_lvb(&res, lvb, lvblen); log_cmd(cmd, "cmd_set_lvb ci %d fd %d result %d res %s:%s", ca->ci_in, fd, result, res.lockspace_name, res.name); reply: if (lvb) free(lvb); send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); } static void cmd_get_lvb(struct task *task GNUC_UNUSED, struct cmd_args *ca, uint32_t cmd) { struct sm_header h; struct sanlk_resource res; char *lvb = NULL; int lvblen = 0, rv, fd, result; fd = client[ca->ci_in].fd; rv = recv(fd, &res, sizeof(struct sanlk_resource), MSG_WAITALL); if (rv != sizeof(struct sanlk_resource)) { log_error("cmd_get_lvb %d,%d recv %d %d", ca->ci_in, fd, rv, errno); result = -ENOTCONN; goto reply; } /* if 0 then we use the sector size as lvb len */ 
lvblen = ca->header.data2; result = res_get_lvb(&res, &lvb, &lvblen); log_cmd(cmd, "cmd_get_lvb ci %d fd %d result %d res %s:%s", ca->ci_in, fd, result, res.lockspace_name, res.name); reply: memcpy(&h, &ca->header, sizeof(struct sm_header)); h.version = SM_PROTO; h.data = result; h.data2 = 0; h.length = sizeof(h) + lvblen; send(fd, &h, sizeof(h), MSG_NOSIGNAL); if (lvb) { send(fd, lvb, lvblen, MSG_NOSIGNAL); free(lvb); } client_resume(ca->ci_in); } static int shutdown_reply_ci = -1; static int shutdown_reply_fd = -1; static int daemon_shutdown_start(int ci, int fd, int force) { int rv; if (force) { shutdown_reply_ci = ci; shutdown_reply_fd = fd; external_shutdown = 2; return 0; } pthread_mutex_lock(&spaces_mutex); if (list_empty(&spaces) && list_empty(&spaces_rem) && list_empty(&spaces_add)) { shutdown_reply_ci = ci; shutdown_reply_fd = fd; external_shutdown = 1; rv = 0; } else { rv = -EBUSY; } pthread_mutex_unlock(&spaces_mutex); return rv; } static void cmd_shutdown_wait(struct task *task GNUC_UNUSED, struct cmd_args *ca, uint32_t cmd) { int fd, result; fd = client[ca->ci_in].fd; result = daemon_shutdown_start(ca->ci_in, fd, ca->header.data); /* * daemon_shutdown_reply will send the result at the * end of main_loop. 
*/ if (!result) return; send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); } void daemon_shutdown_reply(void) { struct sm_header h; /* shutdown wait was not used */ if (shutdown_reply_fd == -1) return; memset(&h, 0, sizeof(h)); h.magic = SM_MAGIC; h.version = SM_PROTO; h.length = sizeof(h); send(shutdown_reply_fd, &h, sizeof(h), MSG_NOSIGNAL); close(shutdown_reply_fd); client_resume(shutdown_reply_ci); } static void cmd_add_lockspace(struct cmd_args *ca, uint32_t cmd) { struct sanlk_lockspace lockspace; struct space *sp; uint32_t io_timeout; int async = ca->header.cmd_flags & SANLK_ADD_ASYNC; int fd, rv, result; fd = client[ca->ci_in].fd; rv = recv(fd, &lockspace, sizeof(struct sanlk_lockspace), MSG_WAITALL); if (rv != sizeof(struct sanlk_lockspace)) { log_error("cmd_add_lockspace %d,%d recv %d %d", ca->ci_in, fd, rv, errno); result = -ENOTCONN; goto reply; } log_cmd(cmd, "cmd_add_lockspace %d,%d %.48s:%llu:%s:%llu flags %x timeout %u", ca->ci_in, fd, lockspace.name, (unsigned long long)lockspace.host_id, lockspace.host_id_disk.path, (unsigned long long)lockspace.host_id_disk.offset, ca->header.cmd_flags, ca->header.data); io_timeout = ca->header.data; if (!io_timeout) io_timeout = DEFAULT_IO_TIMEOUT; rv = add_lockspace_start(&lockspace, io_timeout, &sp); if (rv < 0) { result = rv; goto reply; } if (async) { result = rv; log_cmd(cmd, "cmd_add_lockspace %d,%d async done %d", ca->ci_in, fd, result); send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); add_lockspace_wait(sp); return; } result = add_lockspace_wait(sp); reply: log_cmd(cmd, "cmd_add_lockspace %d,%d done %d", ca->ci_in, fd, result); send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); } static void cmd_inq_lockspace(struct cmd_args *ca, uint32_t cmd) { struct sanlk_lockspace lockspace; int waitrs = ca->header.cmd_flags & SANLK_INQ_WAIT; int fd, rv, result; fd = client[ca->ci_in].fd; rv = recv(fd, &lockspace, sizeof(struct 
sanlk_lockspace), MSG_WAITALL); if (rv != sizeof(struct sanlk_lockspace)) { log_error("cmd_inq_lockspace %d,%d recv %d %d", ca->ci_in, fd, rv, errno); result = -ENOTCONN; goto reply; } log_cmd(cmd, "cmd_inq_lockspace %d,%d %.48s:%llu:%s:%llu flags %x", ca->ci_in, fd, lockspace.name, (unsigned long long)lockspace.host_id, lockspace.host_id_disk.path, (unsigned long long)lockspace.host_id_disk.offset, ca->header.cmd_flags); while (1) { result = inq_lockspace(&lockspace); if ((result != -EINPROGRESS) || !(waitrs)) { break; } sleep(1); } reply: log_cmd(cmd, "cmd_inq_lockspace %d,%d done %d", ca->ci_in, fd, result); send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); } /* * TODO: rem_lockspace works like a renewal failure would, and abandons * resource leases (tokens) without releasing them. Unlike the renewal * failure case, rem_lockspace most likely releases the host_id. * * What might be nice is an option where rem_lockspace would try to * release resource leases before releasing the lockspace host_id. * (We don't really want to be releasing tokens after we've released * our host_id for the token's lockspace.) * * - kill all pids (by looking at struct resource pid?) * - wait for all pids to exit * o have us or other thread release their tokens/resources * o wait for tokens/resources to be released, although the release * may fail or time out, we don't want to wait too long * - set sp->external_remove * - main_loop sets sp->thread_stop (should find no pids) * - main_loop unlinks watchdog * - lockspace_thread releases host_id * * The aim is that we kill pids and wait for resources to be released * before main_loop gets involved and before the lockspace_thread is * told to stop. 
* * An alternative messy is to add another condition to the current * main_loop checks: * * if (sp->killing_pids && all_pids_dead(sp) && all_tokens_released(sp)) { * sp->thread_stop = 1; * deactivate_watchdog(sp); * list_move(spaces_rem); * } * * all_tokens_released would just return 1 in case we're not doing * the releases * * release_token_async would need to learn to put the resources onto * dispose list in this case * * consider using the resources/dispose_resources list for all_pids_dead * and kill_pids? instead of the clients[].tokens[] loops? actually, * could we remove tokens and cl->tokens altogether and just use the * resources list? */ static void cmd_rem_lockspace(struct cmd_args *ca, uint32_t cmd) { struct sanlk_lockspace lockspace; int async = ca->header.cmd_flags & SANLK_REM_ASYNC; int fd, rv, result; unsigned int space_id; fd = client[ca->ci_in].fd; rv = recv(fd, &lockspace, sizeof(struct sanlk_lockspace), MSG_WAITALL); if (rv != sizeof(struct sanlk_lockspace)) { log_error("cmd_rem_lockspace %d,%d recv %d %d", ca->ci_in, fd, rv, errno); result = -ENOTCONN; goto reply; } log_cmd(cmd, "cmd_rem_lockspace %d,%d %.48s flags %x", ca->ci_in, fd, lockspace.name, ca->header.cmd_flags); if (ca->header.cmd_flags & SANLK_REM_UNUSED) { if (lockspace_is_used(&lockspace)) { result = -EBUSY; goto reply; } } rv = rem_lockspace_start(&lockspace, &space_id); if (rv < 0) { result = rv; goto reply; } if (async) { result = rv; log_cmd(cmd, "cmd_rem_lockspace %d,%d async done %d", ca->ci_in, fd, result); send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); rem_lockspace_wait(&lockspace, space_id); return; } result = rem_lockspace_wait(&lockspace, space_id); reply: log_cmd(cmd, "cmd_rem_lockspace %d,%d done %d", ca->ci_in, fd, result); send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); } static void cmd_align(struct task *task GNUC_UNUSED, struct cmd_args *ca, uint32_t cmd) { struct sanlk_disk disk; struct sync_disk sd; int fd, 
rv, result; fd = client[ca->ci_in].fd; rv = recv(fd, &disk, sizeof(struct sanlk_disk), MSG_WAITALL); if (rv != sizeof(struct sanlk_disk)) { log_error("cmd_align %d,%d recv %d %d", ca->ci_in, fd, rv, errno); result = -ENOTCONN; goto reply; } log_cmd(cmd, "cmd_align %d,%d", ca->ci_in, fd); if (!disk.path[0]) { result = -ENODEV; goto reply; } memset(&sd, 0, sizeof(struct sync_disk)); memcpy(&sd, &disk, sizeof(struct sanlk_disk)); sd.fd = -1; rv = open_disk(&sd); if (rv < 0) { result = -ENODEV; goto reply; } result = direct_align(&sd); close_disks(&sd, 1); reply: log_cmd(cmd, "cmd_align %d,%d done %d", ca->ci_in, fd, result); send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); } static void cmd_read_lockspace(struct task *task, struct cmd_args *ca, uint32_t cmd) { struct sm_header h; struct sanlk_lockspace lockspace; struct sync_disk sd; uint64_t host_id; int sector_size = 0; int align_size = 0; int io_timeout = 0; int fd, rv, result; fd = client[ca->ci_in].fd; rv = recv(fd, &lockspace, sizeof(struct sanlk_lockspace), MSG_WAITALL); if (rv != sizeof(struct sanlk_lockspace)) { log_error("cmd_read_lockspace %d,%d recv %d %d", ca->ci_in, fd, rv, errno); result = -ENOTCONN; goto reply; } if (!lockspace.host_id) host_id = 1; else host_id = lockspace.host_id; log_cmd(cmd, "cmd_read_lockspace %d,%d %llu %s:%llu", ca->ci_in, fd, (unsigned long long)host_id, lockspace.host_id_disk.path, (unsigned long long)lockspace.host_id_disk.offset); if (!lockspace.host_id_disk.path[0]) { result = -ENODEV; goto reply; } memset(&sd, 0, sizeof(struct sync_disk)); memcpy(&sd, &lockspace.host_id_disk, sizeof(struct sanlk_disk)); sd.fd = -1; rv = open_disk(&sd); if (rv < 0) { result = -ENODEV; goto reply; } sector_size = sanlk_lsf_sector_flag_to_size(lockspace.flags); align_size = sanlk_lsf_align_flag_to_size(lockspace.flags); if (!sector_size) { /* reads the first leader record to get sector size */ result = delta_read_lockspace_sizes(task, &sd, DEFAULT_IO_TIMEOUT, 
§or_size, &align_size); if (result < 0) goto out_close; if ((sector_size != 512) && (sector_size != 4096)) { result = -EINVAL; goto out_close; } } /* sets ls->name and io_timeout */ result = delta_read_lockspace(task, &sd, sector_size, align_size, host_id, &lockspace, DEFAULT_IO_TIMEOUT, &io_timeout); if (result == SANLK_OK) result = 0; out_close: close_disks(&sd, 1); reply: log_cmd(cmd, "cmd_read_lockspace %d,%d done %d", ca->ci_in, fd, result); memcpy(&h, &ca->header, sizeof(struct sm_header)); h.version = SM_PROTO; h.data = result; h.data2 = io_timeout; h.length = sizeof(h) + sizeof(lockspace); send(fd, &h, sizeof(h), MSG_NOSIGNAL); send(fd, &lockspace, sizeof(lockspace), MSG_NOSIGNAL); client_resume(ca->ci_in); } static void cmd_read_resource(struct task *task, struct cmd_args *ca, uint32_t cmd) { struct sm_header h; struct sanlk_resource res; struct token *token = NULL; int token_len, disks_len; int j, fd, rv, result; fd = client[ca->ci_in].fd; /* receiving and setting up token copied from cmd_acquire */ rv = recv(fd, &res, sizeof(struct sanlk_resource), MSG_WAITALL); if (rv != sizeof(struct sanlk_resource)) { log_error("cmd_read_resource %d,%d recv %d %d", ca->ci_in, fd, rv, errno); result = -ENOTCONN; goto reply; } if (!res.num_disks || res.num_disks > SANLK_MAX_DISKS) { result = -ERANGE; goto reply; } disks_len = res.num_disks * sizeof(struct sync_disk); token_len = sizeof(struct token) + disks_len; token = malloc(token_len); if (!token) { result = -ENOMEM; goto reply; } memset(token, 0, token_len); token->disks = (struct sync_disk *)&token->r.disks[0]; /* shorthand */ token->r.num_disks = res.num_disks; memcpy(token->r.lockspace_name, res.lockspace_name, SANLK_NAME_LEN); memcpy(token->r.name, res.name, SANLK_NAME_LEN); /* * receive sanlk_disk's / sync_disk's * * WARNING: as a shortcut, this requires that sync_disk and * sanlk_disk match; this is the reason for the pad fields * in sanlk_disk (TODO: let these differ?) 
*/ rv = recv(fd, token->disks, disks_len, MSG_WAITALL); if (rv != disks_len) { result = -ENOTCONN; goto reply; } /* zero out pad1 and pad2, see WARNING above */ for (j = 0; j < token->r.num_disks; j++) { token->disks[j].sector_size = 0; token->disks[j].fd = -1; } log_cmd(cmd, "cmd_read_resource %d,%d %.256s:%llu", ca->ci_in, fd, token->disks[0].path, (unsigned long long)token->r.disks[0].offset); rv = open_disks(token->disks, token->r.num_disks); if (rv < 0) { result = rv; goto reply; } token->io_timeout = DEFAULT_IO_TIMEOUT; /* * These may be zero, in which case paxos_read_resource reads a 4K sector * and gets the values from the leader record. */ token->sector_size = sanlk_res_sector_flag_to_size(res.flags); token->align_size = sanlk_res_align_flag_to_size(res.flags); /* sets res.lockspace_name, res.name, res.lver, res.flags */ result = paxos_read_resource(task, token, &res); if (result == SANLK_OK) result = 0; close_disks(token->disks, token->r.num_disks); reply: if (token) free(token); log_cmd(cmd, "cmd_read_resource %d,%d done %d", ca->ci_in, fd, result); memcpy(&h, &ca->header, sizeof(struct sm_header)); h.version = SM_PROTO; h.data = result; h.data2 = 0; h.length = sizeof(h) + sizeof(res); send(fd, &h, sizeof(h), MSG_NOSIGNAL); send(fd, &res, sizeof(res), MSG_NOSIGNAL); client_resume(ca->ci_in); } static void cmd_read_resource_owners(struct task *task, struct cmd_args *ca, uint32_t cmd) { struct sm_header h; struct sanlk_resource res; struct token *token = NULL; char *send_buf; int token_len, disks_len, send_len = 0; int j, fd, rv, result, count = 0; fd = client[ca->ci_in].fd; /* receiving and setting up token copied from cmd_acquire */ rv = recv(fd, &res, sizeof(struct sanlk_resource), MSG_WAITALL); if (rv != sizeof(struct sanlk_resource)) { log_error("cmd_read_resource_owners %d,%d recv %d %d", ca->ci_in, fd, rv, errno); result = -ENOTCONN; goto reply; } if (!res.num_disks || res.num_disks > SANLK_MAX_DISKS) { result = -ERANGE; goto reply; } disks_len = 
res.num_disks * sizeof(struct sync_disk); token_len = sizeof(struct token) + disks_len; token = malloc(token_len); if (!token) { result = -ENOMEM; goto reply; } memset(token, 0, token_len); token->disks = (struct sync_disk *)&token->r.disks[0]; /* shorthand */ token->r.num_disks = res.num_disks; memcpy(token->r.lockspace_name, res.lockspace_name, SANLK_NAME_LEN); memcpy(token->r.name, res.name, SANLK_NAME_LEN); /* * receive sanlk_disk's / sync_disk's * * WARNING: as a shortcut, this requires that sync_disk and * sanlk_disk match; this is the reason for the pad fields * in sanlk_disk (TODO: let these differ?) */ rv = recv(fd, token->disks, disks_len, MSG_WAITALL); if (rv != disks_len) { result = -ENOTCONN; goto reply; } /* zero out pad1 and pad2, see WARNING above */ for (j = 0; j < token->r.num_disks; j++) { token->disks[j].sector_size = 0; token->disks[j].fd = -1; } log_cmd(cmd, "cmd_read_resource_owners %d,%d %.256s:%llu", ca->ci_in, fd, token->disks[0].path, (unsigned long long)token->r.disks[0].offset); rv = open_disks(token->disks, token->r.num_disks); if (rv < 0) { result = rv; goto reply; } token->io_timeout = DEFAULT_IO_TIMEOUT; /* * These may be zero, in which case paxos_read_resource reads a 4K sector * and gets the values from the leader record. 
*/ token->sector_size = sanlk_res_sector_flag_to_size(res.flags); token->align_size = sanlk_res_align_flag_to_size(res.flags); send_buf = NULL; send_len = 0; result = read_resource_owners(task, token, &res, &send_buf, &send_len, &count); if (result == SANLK_OK) result = 0; close_disks(token->disks, token->r.num_disks); reply: if (token) free(token); log_cmd(cmd, "cmd_read_resource_owners %d,%d count %d done %d", ca->ci_in, fd, count, result); memcpy(&h, &ca->header, sizeof(struct sm_header)); h.version = SM_PROTO; h.data = result; h.data2 = count; h.length = sizeof(h) + sizeof(res) + send_len; send(fd, &h, sizeof(h), MSG_NOSIGNAL); send(fd, &res, sizeof(res), MSG_NOSIGNAL); if (send_len && send_buf) { send(fd, send_buf, send_len, MSG_NOSIGNAL); free(send_buf); } client_resume(ca->ci_in); } static void cmd_write_lockspace(struct task *task, struct cmd_args *ca, uint32_t cmd) { struct sanlk_lockspace lockspace; struct sync_disk sd; int fd, rv, result; int io_timeout = DEFAULT_IO_TIMEOUT; fd = client[ca->ci_in].fd; rv = recv(fd, &lockspace, sizeof(struct sanlk_lockspace), MSG_WAITALL); if (rv != sizeof(struct sanlk_lockspace)) { log_error("cmd_write_lockspace %d,%d recv %d %d", ca->ci_in, fd, rv, errno); result = -ENOTCONN; goto reply; } log_cmd(cmd, "cmd_write_lockspace %d,%d %.48s:%llu:%s:%llu 0x%x", ca->ci_in, fd, lockspace.name, (unsigned long long)lockspace.host_id, lockspace.host_id_disk.path, (unsigned long long)lockspace.host_id_disk.offset, lockspace.flags); if (!lockspace.host_id_disk.path[0]) { result = -ENODEV; goto reply; } /* No longer used, max_hosts is derived from sector/align sizes. 
*/ /* max_hosts = ca->header.data; */ memset(&sd, 0, sizeof(struct sync_disk)); memcpy(&sd, &lockspace.host_id_disk, sizeof(struct sanlk_disk)); sd.fd = -1; rv = open_disk(&sd); if (rv < 0) { result = -ENODEV; goto reply; } if (ca->header.data2) io_timeout = ca->header.data2; result = delta_lease_init(task, &lockspace, io_timeout, &sd); close_disks(&sd, 1); reply: log_cmd(cmd, "cmd_write_lockspace %d,%d done %d", ca->ci_in, fd, result); send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); } static void cmd_write_resource(struct task *task, struct cmd_args *ca, uint32_t cmd) { struct token *token = NULL; struct sanlk_resource res; int token_len, disks_len; int num_hosts; int write_clear = 0; int j, fd, rv, result; fd = client[ca->ci_in].fd; /* receiving and setting up token copied from cmd_acquire */ rv = recv(fd, &res, sizeof(struct sanlk_resource), MSG_WAITALL); if (rv != sizeof(struct sanlk_resource)) { log_error("cmd_write_resource %d,%d recv %d %d", ca->ci_in, fd, rv, errno); result = -ENOTCONN; goto reply; } if (!res.num_disks || res.num_disks > SANLK_MAX_DISKS) { result = -ERANGE; goto reply; } disks_len = res.num_disks * sizeof(struct sync_disk); token_len = sizeof(struct token) + disks_len; token = malloc(token_len); if (!token) { result = -ENOMEM; goto reply; } memset(token, 0, token_len); token->disks = (struct sync_disk *)&token->r.disks[0]; /* shorthand */ token->r.num_disks = res.num_disks; memcpy(token->r.lockspace_name, res.lockspace_name, SANLK_NAME_LEN); memcpy(token->r.name, res.name, SANLK_NAME_LEN); token->r.flags = res.flags; /* * receive sanlk_disk's / sync_disk's * * WARNING: as a shortcut, this requires that sync_disk and * sanlk_disk match; this is the reason for the pad fields * in sanlk_disk (TODO: let these differ?) 
*/ rv = recv(fd, token->disks, disks_len, MSG_WAITALL); if (rv != disks_len) { result = -ENOTCONN; goto reply; } /* zero out pad1 and pad2, see WARNING above */ for (j = 0; j < token->r.num_disks; j++) { token->disks[j].sector_size = 0; token->disks[j].fd = -1; } log_cmd(cmd, "cmd_write_resource %d,%d %.48s:%.48s:%.256s:%llu 0x%x", ca->ci_in, fd, token->r.lockspace_name, token->r.name, token->disks[0].path, (unsigned long long)token->r.disks[0].offset, res.flags); num_hosts = ca->header.data; /* No longer used, max_hosts is derived from sector/align sizes. */ /* max_hosts = ca->header.data2; */ if (ca->header.cmd_flags & SANLK_WRITE_CLEAR) write_clear = 1; rv = open_disks(token->disks, token->r.num_disks); if (rv < 0) { result = rv; goto reply; } token->io_timeout = DEFAULT_IO_TIMEOUT; result = paxos_lease_init(task, token, num_hosts, write_clear); close_disks(token->disks, token->r.num_disks); reply: if (token) free(token); send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); } /* N.B. 
the api doesn't support one client setting killpath for another pid/client */ static void cmd_killpath(struct task *task, struct cmd_args *ca, uint32_t cmd) { struct client *cl; int cl_ci = ca->ci_target; int cl_fd = ca->cl_fd; int cl_pid = ca->cl_pid; int rv, result, pid_dead; cl = &client[cl_ci]; log_cmd(cmd, "cmd_killpath %d,%d,%d flags %x", cl_ci, cl_fd, cl_pid, ca->header.cmd_flags); rv = recv(cl_fd, cl->killpath, SANLK_HELPER_PATH_LEN, MSG_WAITALL); if (rv != SANLK_HELPER_PATH_LEN) { log_error("cmd_killpath %d,%d,%d recv path %d %d", cl_ci, cl_fd, cl_pid, rv, errno); memset(cl->killpath, 0, SANLK_HELPER_PATH_LEN); memset(cl->killargs, 0, SANLK_HELPER_ARGS_LEN); result = -ENOTCONN; goto done; } rv = recv(cl_fd, cl->killargs, SANLK_HELPER_ARGS_LEN, MSG_WAITALL); if (rv != SANLK_HELPER_ARGS_LEN) { log_error("cmd_killpath %d,%d,%d recv args %d %d", cl_ci, cl_fd, cl_pid, rv, errno); memset(cl->killpath, 0, SANLK_HELPER_PATH_LEN); memset(cl->killargs, 0, SANLK_HELPER_ARGS_LEN); result = -ENOTCONN; goto done; } cl->killpath[SANLK_HELPER_PATH_LEN - 1] = '\0'; cl->killargs[SANLK_HELPER_ARGS_LEN - 1] = '\0'; if (ca->header.cmd_flags & SANLK_KILLPATH_PID) cl->flags |= CL_KILLPATH_PID; result = 0; done: pthread_mutex_lock(&cl->mutex); pid_dead = cl->pid_dead; cl->cmd_active = 0; pthread_mutex_unlock(&cl->mutex); if (pid_dead) { /* release tokens in case a client sets/changes its killpath after it has acquired leases */ release_cl_tokens(task, cl); client_free(cl_ci); return; } send_result(ca->ci_in, cl_fd, &ca->header, result); client_resume(ca->ci_in); } static void cmd_set_event(struct task *task GNUC_UNUSED, struct cmd_args *ca, uint32_t cmd) { struct sanlk_lockspace lockspace; struct sanlk_host_event he; int rv, fd, result; fd = client[ca->ci_in].fd; rv = recv(fd, &lockspace, sizeof(struct sanlk_lockspace), MSG_WAITALL); if (rv != sizeof(struct sanlk_lockspace)) { result = -ENOTCONN; goto reply; } rv = recv(fd, &he, sizeof(struct sanlk_host_event), MSG_WAITALL); if 
(rv != sizeof(struct sanlk_host_event)) { result = -ENOTCONN; goto reply; } log_cmd(cmd, "cmd_set_event %.48s", lockspace.name); result = lockspace_set_event(&lockspace, &he, ca->header.cmd_flags); log_cmd(cmd, "cmd_set_event result %d", result); reply: send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); } static void cmd_format_rindex(struct task *task, struct cmd_args *ca, uint32_t cmd) { struct sanlk_rindex ri; int fd, rv, result; fd = client[ca->ci_in].fd; rv = recv(fd, &ri, sizeof(struct sanlk_rindex), MSG_WAITALL); if (rv != sizeof(struct sanlk_rindex)) { log_error("cmd_format_rindex %d,%d recv %d %d", ca->ci_in, fd, rv, errno); result = -ENOTCONN; goto reply; } log_cmd(cmd, "cmd_format_rindex %d,%d %.48s %s:%llu", ca->ci_in, fd, ri.lockspace_name, ri.disk.path, (unsigned long long)ri.disk.offset); result = rindex_format(task, &ri); reply: log_cmd(cmd, "cmd_format_rindex %d,%d done %d", ca->ci_in, fd, result); send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); } static void cmd_rebuild_rindex(struct task *task, struct cmd_args *ca, uint32_t cmd) { struct sanlk_rindex ri; int fd, rv, result; fd = client[ca->ci_in].fd; rv = recv(fd, &ri, sizeof(struct sanlk_rindex), MSG_WAITALL); if (rv != sizeof(struct sanlk_rindex)) { log_error("cmd_rebuild_rindex %d,%d recv %d %d", ca->ci_in, fd, rv, errno); result = -ENOTCONN; goto reply; } log_cmd(cmd, "cmd_rebuild_rindex %d,%d %.48s %s:%llu", ca->ci_in, fd, ri.lockspace_name, ri.disk.path, (unsigned long long)ri.disk.offset); result = rindex_rebuild(task, &ri, ca->header.cmd_flags); reply: log_cmd(cmd, "cmd_rebuild_rindex %d,%d done %d", ca->ci_in, fd, result); send_result(ca->ci_in, fd, &ca->header, result); client_resume(ca->ci_in); } static void rindex_op(struct task *task, struct cmd_args *ca, const char *ri_cmd_str, int op, uint32_t cmd) { struct sanlk_rindex ri; struct sanlk_rentry re; struct sanlk_rentry re_ret; struct sm_header h; int fd, rv, result; memset(&re_ret, 
0, sizeof(re_ret)); fd = client[ca->ci_in].fd; rv = recv(fd, &ri, sizeof(struct sanlk_rindex), MSG_WAITALL); if (rv != sizeof(struct sanlk_rindex)) { log_error("%s %d,%d recv %d %d", ri_cmd_str, ca->ci_in, fd, rv, errno); result = -ENOTCONN; goto reply; } rv = recv(fd, &re, sizeof(struct sanlk_rentry), MSG_WAITALL); if (rv != sizeof(struct sanlk_rentry)) { log_error("%s %d,%d recv %d %d", ri_cmd_str, ca->ci_in, fd, rv, errno); result = -ENOTCONN; goto reply; } log_cmd(cmd, "%s %d,%d %.48s %s:%llu", ri_cmd_str, ca->ci_in, fd, ri.lockspace_name, ri.disk.path, (unsigned long long)ri.disk.offset); if (op == RX_OP_LOOKUP) result = rindex_lookup(task, &ri, &re, &re_ret, ca->header.cmd_flags); else if (op == RX_OP_UPDATE) result = rindex_update(task, &ri, &re, &re_ret, ca->header.cmd_flags); else if (op == RX_OP_CREATE) result = rindex_create(task, &ri, &re, &re_ret, ca->header.data, ca->header.data2); else if (op == RX_OP_DELETE) result = rindex_delete(task, &ri, &re, &re_ret); else result = -EINVAL; reply: log_cmd(cmd, "%s %d,%d done %d", ri_cmd_str, ca->ci_in, fd, result); memcpy(&h, &ca->header, sizeof(struct sm_header)); h.version = SM_PROTO; h.data = result; h.data2 = 0; h.length = sizeof(h) + sizeof(re_ret); send(fd, &h, sizeof(h), MSG_NOSIGNAL); send(fd, &re_ret, sizeof(re), MSG_NOSIGNAL); client_resume(ca->ci_in); } void call_cmd_thread(struct task *task, struct cmd_args *ca) { uint32_t cmd = ca->header.cmd; switch (cmd) { case SM_CMD_ACQUIRE: cmd_acquire(task, ca, cmd); break; case SM_CMD_RELEASE: cmd_release(task, ca, cmd); break; case SM_CMD_INQUIRE: cmd_inquire(task, ca, cmd); break; case SM_CMD_CONVERT: cmd_convert(task, ca, cmd); break; case SM_CMD_REQUEST: cmd_request(task, ca, cmd); break; case SM_CMD_ADD_LOCKSPACE: strcpy(client[ca->ci_in].owner_name, "add_lockspace"); cmd_add_lockspace(ca, cmd); break; case SM_CMD_INQ_LOCKSPACE: strcpy(client[ca->ci_in].owner_name, "inq_lockspace"); cmd_inq_lockspace(ca, cmd); break; case SM_CMD_REM_LOCKSPACE: 
strcpy(client[ca->ci_in].owner_name, "rem_lockspace"); cmd_rem_lockspace(ca, cmd); break; case SM_CMD_ALIGN: cmd_align(task, ca, cmd); break; case SM_CMD_WRITE_LOCKSPACE: cmd_write_lockspace(task, ca, cmd); break; case SM_CMD_WRITE_RESOURCE: cmd_write_resource(task, ca, cmd); break; case SM_CMD_READ_LOCKSPACE: cmd_read_lockspace(task, ca, cmd); break; case SM_CMD_READ_RESOURCE: cmd_read_resource(task, ca, cmd); break; case SM_CMD_READ_RESOURCE_OWNERS: cmd_read_resource_owners(task, ca, cmd); break; case SM_CMD_EXAMINE_LOCKSPACE: case SM_CMD_EXAMINE_RESOURCE: cmd_examine(task, ca, cmd); break; case SM_CMD_KILLPATH: cmd_killpath(task, ca, cmd); break; case SM_CMD_SET_LVB: cmd_set_lvb(task, ca, cmd); break; case SM_CMD_GET_LVB: cmd_get_lvb(task, ca, cmd); break; case SM_CMD_SHUTDOWN_WAIT: cmd_shutdown_wait(task, ca, cmd); break; case SM_CMD_SET_EVENT: cmd_set_event(task, ca, cmd); break; case SM_CMD_FORMAT_RINDEX: cmd_format_rindex(task, ca, cmd); break; case SM_CMD_REBUILD_RINDEX: cmd_rebuild_rindex(task, ca, cmd); break; case SM_CMD_UPDATE_RINDEX: rindex_op(task, ca, "cmd_update_rindex", RX_OP_UPDATE, cmd); break; case SM_CMD_LOOKUP_RINDEX: rindex_op(task, ca, "cmd_lookup_rindex", RX_OP_LOOKUP, cmd); break; case SM_CMD_CREATE_RESOURCE: rindex_op(task, ca, "cmd_create_resource", RX_OP_CREATE, cmd); break; case SM_CMD_DELETE_RESOURCE: rindex_op(task, ca, "cmd_delete_resource", RX_OP_DELETE, cmd); break; }; } /* * sanlock client status * * 1. send_state_daemon * * 2. for each cl in clients * send_state_client() [sanlk_state + str_len] * * 3. for each sp in spaces, spaces_add, spaces_rem * send_state_lockspace() [sanlk_state + str_len + sanlk_lockspace] * * 4. for each r in resources, dispose_resources * send_state_resource() [sanlk_state + str_len + sanlk_resource + sanlk_disk * num_disks] * * sanlock client host_status * * 1. 
for each hs in sp->host_status * send_state_host() */ static int print_state_daemon(char *str) { memset(str, 0, SANLK_STATE_MAXSTR); snprintf(str, SANLK_STATE_MAXSTR-1, "our_host_name=%s " "use_watchdog=%d " "high_priority=%d " "mlock_level=%d " "quiet_fail=%d " "debug_renew=%d " "debug_clients=%d " "debug_cmds=0x%llx " "renewal_history_size=%d " "gid=%d " "uid=%d " "sh_retries=%d " "max_sectors_kb_ignore=%d " "max_sectors_kb_align=%d " "max_sectors_kb_num=%d " "write_init_io_timeout=%u " "use_aio=%d " "kill_grace_seconds=%d " "helper_pid=%d " "helper_kill_fd=%d " "helper_full_count=%u " "helper_last_status=%llu " "monotime=%llu " "version_str=%s " "version_num=%u.%u.%u " "version_hex=%08x " "smproto_hex=%08x", our_host_name_global, com.use_watchdog, com.high_priority, com.mlock_level, com.quiet_fail, com.debug_renew, com.debug_clients, (unsigned long long)com.debug_cmds, com.renewal_history_size, com.gid, com.uid, com.sh_retries, com.max_sectors_kb_ignore, com.max_sectors_kb_align, com.max_sectors_kb_num, com.write_init_io_timeout, main_task.use_aio, kill_grace_seconds, helper_pid, helper_kill_fd, helper_full_count, (unsigned long long)helper_last_status, (unsigned long long)monotime(), VERSION, sanlock_version_major, sanlock_version_minor, sanlock_version_patch, sanlock_version_combined, SM_PROTO); return strlen(str) + 1; } static int print_state_client(struct client *cl, int ci, char *str) { memset(str, 0, SANLK_STATE_MAXSTR); snprintf(str, SANLK_STATE_MAXSTR-1, "ci=%d " "fd=%d " "pid=%d " "flags=%x " "restricted=%x " "cmd_active=%d " "cmd_last=%d " "pid_dead=%d " "kill_count=%d " "kill_last=%llu " "suspend=%d " "need_free=%d", ci, cl->fd, cl->pid, cl->flags, cl->restricted, cl->cmd_active, cl->cmd_last, cl->pid_dead, cl->kill_count, (unsigned long long)cl->kill_last, cl->suspend, cl->need_free); return strlen(str) + 1; } static int print_state_lockspace(struct space *sp, char *str, const char *list_name) { memset(str, 0, SANLK_STATE_MAXSTR); snprintf(str, 
SANLK_STATE_MAXSTR-1, "list=%s " "space_id=%u " "io_timeout=%d " "sector_size=%d " "align_size=%d " "host_generation=%llu " "renew_fail=%d " "space_dead=%d " "killing_pids=%d " "used_retries=%u " "external_used=%d " "used_by_orphans=%d " "renewal_read_extend_sec=%u " "corrupt_result=%d " "acquire_last_result=%d " "renewal_last_result=%d " "acquire_last_attempt=%llu " "acquire_last_success=%llu " "renewal_last_attempt=%llu " "renewal_last_success=%llu", list_name, sp->space_id, sp->io_timeout, sp->sector_size, sp->align_size, (unsigned long long)sp->host_generation, sp->renew_fail, sp->space_dead, sp->killing_pids, sp->used_retries, (sp->flags & SP_EXTERNAL_USED) ? 1 : 0, (sp->flags & SP_USED_BY_ORPHANS) ? 1 : 0, sp->renewal_read_extend_sec, sp->lease_status.corrupt_result, sp->lease_status.acquire_last_result, sp->lease_status.renewal_last_result, (unsigned long long)sp->lease_status.acquire_last_attempt, (unsigned long long)sp->lease_status.acquire_last_success, (unsigned long long)sp->lease_status.renewal_last_attempt, (unsigned long long)sp->lease_status.renewal_last_success); return strlen(str) + 1; } static int print_state_resource(struct resource *r, char *str, const char *list_name, uint32_t token_id) { memset(str, 0, SANLK_STATE_MAXSTR); snprintf(str, SANLK_STATE_MAXSTR-1, "list=%s " "flags=%x " "sector_size=%d " "align_size=%d " "lver=%llu " "reused=%u " "res_id=%u " "token_id=%u", list_name, r->flags, r->sector_size, r->align_size, (unsigned long long)r->leader.lver, r->reused, r->res_id, token_id); return strlen(str) + 1; } static int print_state_host(struct host_status *hs, char *str) { memset(str, 0, SANLK_STATE_MAXSTR); snprintf(str, SANLK_STATE_MAXSTR-1, "last_check=%llu " "last_live=%llu " "last_req=%llu " "owner_id=%llu " "owner_generation=%llu " "timestamp=%llu " "io_timeout=%u " "owner_name=%.48s", (unsigned long long)hs->last_check, (unsigned long long)hs->last_live, (unsigned long long)hs->last_req, (unsigned long long)hs->owner_id, (unsigned 
long long)hs->owner_generation, (unsigned long long)hs->timestamp, hs->io_timeout, hs->owner_name); return strlen(str) + 1; } static int print_state_renewal(struct renewal_history *hi, char *str) { memset(str, 0, SANLK_STATE_MAXSTR); snprintf(str, SANLK_STATE_MAXSTR-1, "timestamp=%llu " "read_ms=%d " "write_ms=%d " "next_timeouts=%d " "next_errors=%d", (unsigned long long)hi->timestamp, hi->read_ms, hi->write_ms, hi->next_timeouts, hi->next_errors); return strlen(str) + 1; } static void send_state_daemon(int fd) { struct sanlk_state st; char str[SANLK_STATE_MAXSTR]; int str_len; memset(&st, 0, sizeof(st)); strncpy(st.name, our_host_name_global, NAME_ID_SIZE); st.type = SANLK_STATE_DAEMON; str_len = print_state_daemon(str); st.str_len = str_len; send(fd, &st, sizeof(st), MSG_NOSIGNAL); if (str_len) send(fd, str, str_len, MSG_NOSIGNAL); } static void send_state_client(int fd, struct client *cl, int ci) { struct sanlk_state st; char str[SANLK_STATE_MAXSTR]; int str_len; memset(&st, 0, sizeof(st)); st.type = SANLK_STATE_CLIENT; st.data32 = cl->pid; strncpy(st.name, cl->owner_name, NAME_ID_SIZE); str_len = print_state_client(cl, ci, str); st.str_len = str_len; send(fd, &st, sizeof(st), MSG_NOSIGNAL); if (str_len) send(fd, str, str_len, MSG_NOSIGNAL); } static void send_state_lockspace(int fd, struct space *sp, const char *list_name) { struct sanlk_state st; struct sanlk_lockspace lockspace; char str[SANLK_STATE_MAXSTR]; int str_len; memset(&st, 0, sizeof(st)); st.type = SANLK_STATE_LOCKSPACE; st.data64 = sp->host_id; strncpy(st.name, sp->space_name, NAME_ID_SIZE); str_len = print_state_lockspace(sp, str, list_name); st.str_len = str_len; send(fd, &st, sizeof(st), MSG_NOSIGNAL); if (str_len) send(fd, str, str_len, MSG_NOSIGNAL); memset(&lockspace, 0, sizeof(struct sanlk_lockspace)); strncpy(lockspace.name, sp->space_name, NAME_ID_SIZE); lockspace.host_id = sp->host_id; memcpy(&lockspace.host_id_disk, &sp->host_id_disk, sizeof(struct sanlk_disk)); send(fd, &lockspace, 
sizeof(lockspace), MSG_NOSIGNAL); } void send_state_resource(int fd, struct resource *r, const char *list_name, int pid, uint32_t token_id); void send_state_resource(int fd, struct resource *r, const char *list_name, int pid, uint32_t token_id) { struct sanlk_state st; char str[SANLK_STATE_MAXSTR]; int str_len; int i; memset(&st, 0, sizeof(st)); st.type = SANLK_STATE_RESOURCE; st.data32 = pid; st.data64 = r->leader.lver; strncpy(st.name, r->r.name, NAME_ID_SIZE); str_len = print_state_resource(r, str, list_name, token_id); st.str_len = str_len; send(fd, &st, sizeof(st), MSG_NOSIGNAL); if (str_len) send(fd, str, str_len, MSG_NOSIGNAL); send(fd, &r->r, sizeof(struct sanlk_resource), MSG_NOSIGNAL); for (i = 0; i < r->r.num_disks; i++) { send(fd, &r->r.disks[i], sizeof(struct sanlk_disk), MSG_NOSIGNAL); } } static void send_state_host(int fd, struct host_status *hs, int host_id) { struct sanlk_state st; char str[SANLK_STATE_MAXSTR]; int str_len; memset(&st, 0, sizeof(st)); st.type = SANLK_STATE_HOST; st.data32 = host_id; st.data64 = hs->timestamp; str_len = print_state_host(hs, str); st.str_len = str_len; send(fd, &st, sizeof(st), MSG_NOSIGNAL); if (str_len) send(fd, str, str_len, MSG_NOSIGNAL); } static void send_state_renewal(int fd, struct renewal_history *hi) { struct sanlk_state st; char str[SANLK_STATE_MAXSTR]; int str_len; memset(&st, 0, sizeof(st)); st.type = SANLK_STATE_RENEWAL; st.data64 = hi->timestamp; str_len = print_state_renewal(hi, str); st.str_len = str_len; send(fd, &st, sizeof(st), MSG_NOSIGNAL); if (str_len) send(fd, str, str_len, MSG_NOSIGNAL); } static void cmd_status(int ci, int fd, struct sm_header *h_recv, int client_maxi, uint32_t cmd) { struct sm_header h; struct client *cl; struct space *sp; int ci_iter; log_cmd(cmd, "cmd_status %d,%d", ci, fd); memset(&h, 0, sizeof(h)); memcpy(&h, h_recv, sizeof(struct sm_header)); h.version = SM_PROTO; h.length = sizeof(h); h.data = 0; send(fd, &h, sizeof(h), MSG_NOSIGNAL); send_state_daemon(fd); if 
(h_recv->data == SANLK_STATE_DAEMON) return; for (ci_iter = 0; ci_iter <= client_maxi; ci_iter++) { cl = &client[ci_iter]; if (!cl->used) continue; send_state_client(fd, cl, ci_iter); } if (h_recv->data == SANLK_STATE_CLIENT) return; /* N.B. the reporting function looks for the strings "add" and "rem", so if changed, the strings should be changed in both places. */ pthread_mutex_lock(&spaces_mutex); list_for_each_entry(sp, &spaces, list) send_state_lockspace(fd, sp, "spaces"); list_for_each_entry(sp, &spaces_add, list) send_state_lockspace(fd, sp, "add"); list_for_each_entry(sp, &spaces_rem, list) send_state_lockspace(fd, sp, "rem"); pthread_mutex_unlock(&spaces_mutex); if (h_recv->data == SANLK_STATE_LOCKSPACE) return; /* resource.c will iterate through private lists and call back here for each r */ send_state_resources(fd); } static void cmd_host_status(int ci, int fd, struct sm_header *h_recv, uint32_t cmd) { struct sm_header h; struct sanlk_lockspace lockspace; struct space *sp; struct host_status *hs, *status = NULL; int status_len; int i, rv; log_cmd(cmd, "cmd_host_status %d,%d", ci, fd); memset(&h, 0, sizeof(h)); memcpy(&h, h_recv, sizeof(struct sm_header)); h.version = SM_PROTO; h.length = sizeof(h); h.data = 0; status_len = sizeof(struct host_status) * DEFAULT_MAX_HOSTS; status = malloc(status_len); if (!status) { h.data = -ENOMEM; goto fail; } rv = recv(fd, &lockspace, sizeof(struct sanlk_lockspace), MSG_WAITALL); if (rv != sizeof(struct sanlk_lockspace)) { h.data = -ENOTCONN; goto fail; } pthread_mutex_lock(&spaces_mutex); sp = find_lockspace(lockspace.name); if (sp) memcpy(status, &sp->host_status, status_len); pthread_mutex_unlock(&spaces_mutex); if (!sp) { h.data = -ENOSPC; goto fail; } send(fd, &h, sizeof(h), MSG_NOSIGNAL); for (i = 0; i < DEFAULT_MAX_HOSTS; i++) { hs = &status[i]; if (!hs->last_live && !hs->owner_id) continue; send_state_host(fd, hs, i+1); } if (status) free(status); return; fail: send(fd, &h, sizeof(h), MSG_NOSIGNAL); if (status) 
free(status); } static void cmd_renewal(int fd, struct sm_header *h_recv) { struct sm_header h; struct sanlk_lockspace lockspace; struct space *sp; uint32_t io_timeout = 0; struct renewal_history *history = NULL; struct renewal_history *hi; int history_size, history_prev, history_next; int i, rv, len; memset(&h, 0, sizeof(h)); memcpy(&h, h_recv, sizeof(struct sm_header)); h.version = SM_PROTO; h.length = sizeof(h); h.data = 0; if (!com.renewal_history_size) goto fail; len = sizeof(struct renewal_history) * com.renewal_history_size; history = malloc(len); if (!history) { h.data = -ENOMEM; goto fail; } rv = recv(fd, &lockspace, sizeof(struct sanlk_lockspace), MSG_WAITALL); if (rv != sizeof(struct sanlk_lockspace)) { h.data = -ENOTCONN; goto fail; } pthread_mutex_lock(&spaces_mutex); sp = find_lockspace(lockspace.name); if (sp) { history_size = sp->renewal_history_size; history_prev = sp->renewal_history_prev; history_next = sp->renewal_history_next; io_timeout = sp->io_timeout; if (history_size != com.renewal_history_size) { log_error("mismatch history size"); history_size = 0; history_prev = 0; history_next = 0; } else { memcpy(history, sp->renewal_history, len); } } pthread_mutex_unlock(&spaces_mutex); if (!sp) { h.data = -ENOSPC; goto fail; } if (!history_size || (!history_prev && !history_next)) goto fail; h.data2 = io_timeout; send(fd, &h, sizeof(h), MSG_NOSIGNAL); /* If next slot is non-zero, then we've wrapped and should begin sending history from next to end before sending from 0 to prev. 
*/ if (history[history_next].timestamp) { for (i = history_next; i < history_size; i++) { hi = &history[i]; send_state_renewal(fd, hi); } } for (i = 0; i < history_next; i++) { hi = &history[i]; send_state_renewal(fd, hi); } if (history) free(history); return; fail: send(fd, &h, sizeof(h), MSG_NOSIGNAL); if (history) free(history); } static char send_data_buf[LOG_DUMP_SIZE]; static void cmd_log_dump(int fd, struct sm_header *h_recv) { int len; copy_log_dump(send_data_buf, &len); h_recv->version = SM_PROTO; h_recv->data = len; send(fd, h_recv, sizeof(struct sm_header), MSG_NOSIGNAL); send(fd, send_data_buf, len, MSG_NOSIGNAL); } static void cmd_get_lockspaces(int ci, int fd, struct sm_header *h_recv, uint32_t cmd) { int count, len, rv; log_cmd(cmd, "cmd_get_lockspaces %d,%d", ci, fd); rv = get_lockspaces(send_data_buf, &len, &count, LOG_DUMP_SIZE); h_recv->version = SM_PROTO; h_recv->length = sizeof(struct sm_header) + len; h_recv->data = rv; h_recv->data2 = count; send(fd, h_recv, sizeof(struct sm_header), MSG_NOSIGNAL); send(fd, send_data_buf, len, MSG_NOSIGNAL); } static void cmd_get_hosts(int ci, int fd, struct sm_header *h_recv, uint32_t cmd) { struct sm_header h; struct sanlk_lockspace lockspace; int count = 0, len = 0, rv; log_cmd(cmd, "cmd_get_hosts %d,%d", ci, fd); memset(&h, 0, sizeof(h)); memcpy(&h, h_recv, sizeof(struct sm_header)); h.version = SM_PROTO; h.length = sizeof(h); h.data = 0; rv = recv(fd, &lockspace, sizeof(struct sanlk_lockspace), MSG_WAITALL); if (rv != sizeof(struct sanlk_lockspace)) { h.data = -ENOTCONN; goto out; } rv = get_hosts(&lockspace, send_data_buf, &len, &count, LOG_DUMP_SIZE); h.length = sizeof(struct sm_header) + len; h.data = rv; h.data2 = count; out: send(fd, &h, sizeof(struct sm_header), MSG_NOSIGNAL); if (len) send(fd, send_data_buf, len, MSG_NOSIGNAL); } static void cmd_restrict(int ci, int fd, struct sm_header *h_recv, uint32_t cmd) { log_cmd(cmd, "cmd_restrict ci %d fd %d pid %d flags %x", ci, fd, client[ci].pid, 
h_recv->cmd_flags); client[ci].restricted = h_recv->cmd_flags; h_recv->version = SM_PROTO; send_result(ci, fd, h_recv, 0); } static void cmd_version(int ci GNUC_UNUSED, int fd, struct sm_header *h_recv) { h_recv->magic = SM_MAGIC; h_recv->version = SM_PROTO; h_recv->cmd = SM_CMD_VERSION; h_recv->cmd_flags = 0; h_recv->length = sizeof(struct sm_header); h_recv->seq = 0; h_recv->data = 0; h_recv->data2 = sanlock_version_combined; send(fd, h_recv, sizeof(struct sm_header), MSG_NOSIGNAL); } static void cmd_reg_event(int fd, struct sm_header *h_recv, uint32_t cmd) { struct sm_header h; struct sanlk_lockspace lockspace; struct sanlk_host_event he; int rv; memcpy(&h, h_recv, sizeof(struct sm_header)); h.version = SM_PROTO; h.length = sizeof(struct sm_header); rv = recv(fd, &lockspace, sizeof(struct sanlk_lockspace), MSG_WAITALL); if (rv != sizeof(struct sanlk_lockspace)) { h.data = -ENOTCONN; goto out; } /* currently unused */ rv = recv(fd, &he, sizeof(he), MSG_WAITALL); if (rv != sizeof(he)) { h.data = -ENOTCONN; goto out; } rv = lockspace_reg_event(&lockspace, fd, h_recv->cmd_flags); h.data = rv; out: log_cmd(cmd, "cmd_reg_event fd %d rv %d", fd, rv); send(fd, &h, sizeof(struct sm_header), MSG_NOSIGNAL); } static void cmd_end_event(int fd, struct sm_header *h_recv, uint32_t cmd) { struct sm_header h; struct sanlk_lockspace lockspace; int rv; memcpy(&h, h_recv, sizeof(struct sm_header)); h.version = SM_PROTO; h.length = sizeof(struct sm_header); rv = recv(fd, &lockspace, sizeof(struct sanlk_lockspace), MSG_WAITALL); if (rv != sizeof(struct sanlk_lockspace)) { h.data = -ENOTCONN; goto out; } rv = lockspace_end_event(&lockspace); h.data = rv; out: log_cmd(cmd, "cmd_end_event fd %d rv %d", fd, rv); send(fd, &h, sizeof(struct sm_header), MSG_NOSIGNAL); } static void cmd_set_config(int fd, struct sm_header *h_recv, uint32_t cmd) { struct sm_header h; struct sanlk_lockspace lockspace; int rv; memcpy(&h, h_recv, sizeof(struct sm_header)); h.version = SM_PROTO; h.length = 
sizeof(struct sm_header); rv = recv(fd, &lockspace, sizeof(struct sanlk_lockspace), MSG_WAITALL); if (rv != sizeof(struct sanlk_lockspace)) { h.data = -ENOTCONN; goto out; } rv = lockspace_set_config(&lockspace, h_recv->cmd_flags, h_recv->data); h.data = rv; out: log_cmd(cmd, "cmd_set_config fd %d rv %d", fd, rv); send(fd, &h, sizeof(struct sm_header), MSG_NOSIGNAL); } static int get_peer_pid(int fd, int *pid) { struct ucred cred; unsigned int len = sizeof(cred); if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cred, &len) != 0) return -1; *pid = cred.pid; return 0; } void call_cmd_daemon(int ci, struct sm_header *h_recv, int client_maxi) { int rv, pid, auto_close = 1; int fd = client[ci].fd; uint32_t cmd = h_recv->cmd; switch (cmd) { case SM_CMD_REGISTER: rv = get_peer_pid(fd, &pid); if (rv < 0) { log_error("cmd_register ci %d fd %d get pid failed", ci, fd); break; } log_cmd(cmd, "cmd_register ci %d fd %d pid %d", ci, fd, pid); snprintf(client[ci].owner_name, SANLK_NAME_LEN, "%d", pid); client[ci].pid = pid; client[ci].deadfn = client_pid_dead; if (client[ci].tokens) { log_error("cmd_register ci %d fd %d tokens exist slots %d", ci, fd, client[ci].tokens_slots); free(client[ci].tokens); } client[ci].tokens_slots = SANLK_MAX_RESOURCES; client[ci].tokens = malloc(sizeof(struct token *) * SANLK_MAX_RESOURCES); if (!client[ci].tokens) { rv = -ENOMEM; log_error("cmd_register ci %d fd %d ENOMEM", ci, fd); break; } memset(client[ci].tokens, 0, sizeof(struct token *) * SANLK_MAX_RESOURCES); auto_close = 0; break; case SM_CMD_RESTRICT: cmd_restrict(ci, fd, h_recv, cmd); auto_close = 0; break; case SM_CMD_VERSION: cmd_version(ci, fd, h_recv); auto_close = 0; break; case SM_CMD_SHUTDOWN: strcpy(client[ci].owner_name, "shutdown"); if (h_recv->data) { /* force */ external_shutdown = 2; } else { pthread_mutex_lock(&spaces_mutex); if (list_empty(&spaces) && list_empty(&spaces_rem) && list_empty(&spaces_add)) external_shutdown = 1; else log_debug("ignore shutdown, lockspace exists"); 
pthread_mutex_unlock(&spaces_mutex); } break; case SM_CMD_STATUS: strcpy(client[ci].owner_name, "status"); cmd_status(ci, fd, h_recv, client_maxi, cmd); break; case SM_CMD_HOST_STATUS: strcpy(client[ci].owner_name, "host_status"); cmd_host_status(ci, fd, h_recv, cmd); break; case SM_CMD_RENEWAL: strcpy(client[ci].owner_name, "renewal"); cmd_renewal(fd, h_recv); break; case SM_CMD_LOG_DUMP: strcpy(client[ci].owner_name, "log_dump"); cmd_log_dump(fd, h_recv); break; case SM_CMD_GET_LOCKSPACES: strcpy(client[ci].owner_name, "get_lockspaces"); cmd_get_lockspaces(ci, fd, h_recv, cmd); break; case SM_CMD_GET_HOSTS: strcpy(client[ci].owner_name, "get_hosts"); cmd_get_hosts(ci, fd, h_recv, cmd); break; case SM_CMD_REG_EVENT: strcpy(client[ci].owner_name, "reg_event"); cmd_reg_event(fd, h_recv, cmd); break; case SM_CMD_END_EVENT: strcpy(client[ci].owner_name, "end_event"); cmd_end_event(fd, h_recv, cmd); break; case SM_CMD_SET_CONFIG: strcpy(client[ci].owner_name, "set_config"); cmd_set_config(fd, h_recv, cmd); break; }; /* * Previously just called close(fd) and did not set client[ci].fd = -1. * This meant that a new client ci could get this fd and use it. * * When a poll error occurs because this ci was finished, then * client_free(ci) would be called for this ci. client_free would * see cl->fd was still set and call close() on it, even though that * fd was now in use by another ci. * * We could probably get by with just doing this here: * client[ci].fd = -1; * close(fd); * * and then handling the full client_free in response to * the poll error (as done previously), but I see no reason * to avoid the full client_free here. */ if (auto_close) client_free(ci); } sanlock-3.8.2/src/cmd.h000066400000000000000000000013021371427612200146620ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. 
#ifndef __CMD_H__
#define __CMD_H__

/*
 * Arguments for one client command.
 *
 * NOTE(review): field meanings below are inferred from names only;
 * confirm against the code that fills in and consumes cmd_args.
 */
struct cmd_args {
	struct list_head list; /* thread_pool data */
	int ci_in;             /* presumably the client index the command arrived on */
	int ci_target;         /* presumably the client index the command acts on */
	int cl_fd;             /* presumably the client's socket fd */
	int cl_pid;            /* presumably the client's pid */
	struct sm_header header; /* a struct sm_header stored by value */
};

/* cmds processed by thread pool */
void call_cmd_thread(struct task *task, struct cmd_args *ca);

/* cmds processed by main loop */
void call_cmd_daemon(int ci, struct sm_header *h_recv, int client_maxi);
void daemon_shutdown_reply(void);

#endif
/*
 * This is the CRC-32C table
 * Generated with:
 * width = 32 bits
 * poly = 0x1EDC6F41
 * reflect input bytes = true
 * reflect output bytes = true
 */
static const uint32_t crc32c_table[256] = {
	0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
	0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
	0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
	0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
	0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
	0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
	0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
	0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
	0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
	0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
	0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
	0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
	0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
	0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
	0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
	0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
	0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
	0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
	0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
	0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
	0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
	0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
	0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
	0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
	0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
	0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
	0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
	0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
	0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
	0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
	0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
	0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
	0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
	0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
	0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
	0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
	0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
	0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
	0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
	0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
	0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
	0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
	0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
	0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
	0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
	0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
	0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
	0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
	0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
	0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
	0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
	0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
	0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
	0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
	0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
	0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
	0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
	0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
	0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
	0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
	0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
	0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL,
	0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
	0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
};

uint32_t crc32c(uint32_t crc, uint8_t *data, size_t length);

/*
 * Fold `length` bytes starting at `data` into the running value `crc`,
 * one byte at a time, using the reflected table above, and return the
 * updated value.  No initial or final inversion is applied here; the
 * caller chooses the seed and any post-processing.  A length of 0
 * returns crc unchanged without touching data.
 */
uint32_t crc32c(uint32_t crc, uint8_t *data, size_t length)
{
	size_t pos;

	for (pos = 0; pos < length; pos++) {
		uint32_t slot = (crc ^ data[pos]) & 0xFFL;

		crc = crc32c_table[slot] ^ (crc >> 8);
	}

	return crc;
}
caller : "unknown", result, space_name, (unsigned long long)host_id); log_error("leader2 path %s offset %llu", disk->path, (unsigned long long)disk->offset); log_error("leader3 m %x v %x ss %u nh %llu mh %llu oi %llu og %llu lv %llu", lr->magic, lr->version, lr->sector_size, (unsigned long long)lr->num_hosts, (unsigned long long)lr->max_hosts, (unsigned long long)lr->owner_id, (unsigned long long)lr->owner_generation, (unsigned long long)lr->lver); log_error("leader4 sn %.48s rn %.48s ts %llu cs %x", lr->space_name, lr->resource_name, (unsigned long long)lr->timestamp, lr->checksum); } static int verify_leader(struct sync_disk *disk, char *space_name, uint64_t host_id, struct leader_record *lr, uint32_t checksum, const char *caller) { int result; if (lr->magic != DELTA_DISK_MAGIC) { log_error("verify_leader %llu wrong magic %x %s", (unsigned long long)host_id, lr->magic, disk->path); result = SANLK_LEADER_MAGIC; goto fail; } if ((lr->version & 0xFFFF0000) != DELTA_DISK_VERSION_MAJOR) { log_error("verify_leader %llu wrong version %x %s", (unsigned long long)host_id, lr->version, disk->path); result = SANLK_LEADER_VERSION; goto fail; } if (strncmp(lr->space_name, space_name, NAME_ID_SIZE)) { log_error("verify_leader %llu wrong space name %.48s %.48s %s", (unsigned long long)host_id, lr->space_name, space_name, disk->path); result = SANLK_LEADER_LOCKSPACE; goto fail; } if (lr->checksum != checksum) { log_error("verify_leader %llu wrong checksum %x %x %s", (unsigned long long)host_id, lr->checksum, checksum, disk->path); result = SANLK_LEADER_CHECKSUM; goto fail; } return SANLK_OK; fail: log_leader_error(result, space_name, host_id, disk, lr, caller); /* struct leader_record leader_end; struct leader_record leader_rr; int rv; memset(&leader_end, 0, sizeof(leader_end)); rv = read_sectors(disk, lr->sector_size, host_id - 1, 1, (char *)&leader_end, sizeof(struct leader_record), NULL, "delta_verify"); leader_record_in(&leader_end, &leader_rr); log_leader_error(rv, 
space_name, host_id, disk, &leader_rr, "delta_verify"); */ return result; } /* read the lockspace name and io_timeout given the disk location */ int delta_read_lockspace(struct task *task, struct sync_disk *disk, int sector_size_hint, int align_size_hint, uint64_t host_id, struct sanlk_lockspace *ls, int io_timeout, int *io_timeout_ret) { struct leader_record leader_end; struct leader_record leader; uint32_t checksum; char *space_name; int align_size; int rv, error; /* host_id N is block offset N-1 */ memset(&leader_end, 0, sizeof(struct leader_record)); /* * All we need to read is the leader_record which is returned whether * or not the sector_size_hint is wrong or not. */ rv = read_sectors(disk, sector_size_hint, host_id - 1, 1, (char *)&leader_end, sizeof(struct leader_record), task, io_timeout, "read_lockspace"); if (rv < 0) return rv; /* N.B. compute checksum before byte swapping */ checksum = leader_checksum(&leader_end); leader_record_in(&leader_end, &leader); if (!ls->name[0]) space_name = leader.space_name; else space_name = ls->name; error = verify_leader(disk, space_name, host_id, &leader, checksum, "read_lockspace"); if (error == SANLK_OK) { memcpy(ls->name, leader.space_name, SANLK_NAME_LEN); ls->host_id = host_id; *io_timeout_ret = leader.io_timeout; align_size = leader_align_size_from_flag(leader.flags); if (!align_size) align_size = sector_size_to_align_size_old(leader.sector_size); /* The flags set by the user may not have been correct. 
*/ sanlk_lsf_sector_flags_clear(&ls->flags); sanlk_lsf_align_flags_clear(&ls->flags); ls->flags |= sanlk_lsf_sector_size_to_flag(leader.sector_size); ls->flags |= sanlk_lsf_align_size_to_flag(align_size); } return error; } int delta_read_lockspace_sizes(struct task *task, struct sync_disk *disk, int io_timeout, int *sector_size, int *align_size) { struct leader_record leader_end; struct leader_record leader; int rv; memset(&leader_end, 0, sizeof(struct leader_record)); /* * read the first 4k, which either includes one 4k delta lease or 8 512b * delta leases. In either case, we only look at the initial leader * record to get to the sector size. */ rv = read_sectors(disk, 4096, 0, 1, (char *)&leader_end, sizeof(struct leader_record), task, io_timeout, "read_lockspace_sector_size"); if (rv < 0) return rv; leader_record_in(&leader_end, &leader); if (leader.magic != DELTA_DISK_MAGIC) return SANLK_LEADER_MAGIC; if ((leader.version & 0xFFFF0000) != DELTA_DISK_VERSION_MAJOR) return SANLK_LEADER_VERSION; *sector_size = leader.sector_size; *align_size = leader_align_size_from_flag(leader.flags); if (!*align_size) *align_size = sector_size_to_align_size_old(leader.sector_size); return SANLK_OK; } int delta_lease_leader_read(struct task *task, int sector_size, int io_timeout, struct sync_disk *disk, char *space_name, uint64_t host_id, struct leader_record *leader_ret, const char *caller) { struct leader_record leader_end; struct leader_record leader; uint32_t checksum; int rv, error; if (!sector_size) { log_error("delta_lease_leader_read with zero sector_size %s", space_name); return -EINVAL; } /* host_id N is block offset N-1 */ memset(&leader_end, 0, sizeof(struct leader_record)); memset(leader_ret, 0, sizeof(struct leader_record)); rv = read_sectors(disk, sector_size, host_id - 1, 1, (char *)&leader_end, sizeof(struct leader_record), task, io_timeout, "delta_leader"); if (rv < 0) return rv; /* N.B. 
compute checksum before byte swapping */ checksum = leader_checksum(&leader_end); leader_record_in(&leader_end, &leader); error = verify_leader(disk, space_name, host_id, &leader, checksum, caller); memcpy(leader_ret, &leader, sizeof(struct leader_record)); return error; } /* * NB. this should not be used to write the leader record, it is meant only * for manually clobbering the disk to corrupt it for testing, or to manually * repair it after it's corrupted. */ int delta_lease_leader_clobber(struct task *task, int io_timeout, struct sync_disk *disk, uint64_t host_id, struct leader_record *leader, const char *caller) { struct leader_record leader_end; int rv; leader_record_out(leader, &leader_end); rv = write_sector(disk, leader->sector_size, host_id - 1, (char *)&leader_end, sizeof(struct leader_record), task, io_timeout, caller); if (rv < 0) return rv; return SANLK_OK; } /* * delta_lease_acquire: * set the owner of host_id to our_host_name. * * paxos_lease_acquire: * set the owner of resource_name to host_id. * * our_host_name is a unique host identifier used to detect when two different * hosts are trying to acquire the same host_id (since both will be using the * same host_id, that host_id won't work to distinguish between them.) We copy * our_host_name into leader.resource_name, so in a sense the owner_id and * resource_name fields of the leader_record switch functions: the common * resource is the ower_id, and the distinguishing id is the resource_name. 
*/ int delta_lease_acquire(struct task *task, struct space *sp, struct sync_disk *disk, char *space_name, char *our_host_name, uint64_t host_id, struct leader_record *leader_ret) { struct leader_record leader; struct leader_record leader1; struct leader_record leader_end; uint64_t new_ts; uint32_t checksum; int other_io_timeout, other_host_dead_seconds, other_id_renewal_seconds; int i, error, rv, delay, delta_large_delay; log_space(sp, "delta_acquire begin %.48s:%llu", sp->space_name, (unsigned long long)host_id); error = delta_lease_leader_read(task, sp->sector_size, sp->io_timeout, disk, space_name, host_id, &leader, "delta_acquire_begin"); if (error < 0) { log_space(sp, "delta_acquire leader_read1 error %d", error); return error; } other_io_timeout = leader.io_timeout; if (!other_io_timeout) { log_erros(sp, "delta_acquire use own io_timeout %d", sp->io_timeout); other_io_timeout = sp->io_timeout; } else if (other_io_timeout != sp->io_timeout) { log_erros(sp, "delta_acquire other_io_timeout %u our %u", leader.io_timeout, sp->io_timeout); } if (leader.timestamp == LEASE_FREE) goto write_new; if (!strncmp(leader.resource_name, our_host_name, NAME_ID_SIZE)) { log_space(sp, "delta_acquire fast reacquire"); goto write_new; } /* we need to ensure that a host_id cannot be acquired and released * sooner than host_dead_seconds because the change in host_id * ownership affects the host_id "liveness" determination used by paxos * leases, and the ownership of paxos leases cannot change until after * host_dead_seconds to ensure that the watchdog has fired. So, I * think we want the delay here to be the max of host_dead_seconds and * the D+6d delay. * * Per the algorithm in the paper, a delta lease can change ownership * in the while loop below after the delta_delay of D+6d. However, * because we use the change of delta lease ownership to directly * determine the change in paxos lease ownership, we need the delta * delay to also meet the delay requirements of the paxos leases. 
The * paxos leases cannot change ownership until a min of * host_dead_seconds to ensure the watchdog has fired. So, the timeout * we use here must be the max of the delta delay (D+6d) and * host_dead_seconds */ /* * delay = task->host_dead_seconds; * delta_large_delay = task->id_renewal_seconds + (6 * task->io_timeout_seconds); * if (delta_large_delay > delay) * delay = delta_large_delay; */ other_host_dead_seconds = calc_host_dead_seconds(other_io_timeout); other_id_renewal_seconds = calc_id_renewal_seconds(other_io_timeout); delay = other_host_dead_seconds; delta_large_delay = other_id_renewal_seconds + (6 * other_io_timeout); if (delta_large_delay > delay) delay = delta_large_delay; while (1) { memcpy(&leader1, &leader, sizeof(struct leader_record)); log_space(sp, "delta_acquire delta_large_delay %d delay %d", delta_large_delay, delay); /* TODO: we could reread every several seconds to see if it has changed, so we can abort more quickly if so */ for (i = 0; i < delay; i++) { if (sp->external_remove || external_shutdown) { log_space(sp, "delta_acquire abort1 remove %d shutdown %d", sp->external_remove, external_shutdown); return SANLK_ERROR; } sleep(1); } error = delta_lease_leader_read(task, sp->sector_size, sp->io_timeout, disk, space_name, host_id, &leader, "delta_acquire_wait"); if (error < 0) { log_space(sp, "delta_acquire leader_read2 error %d", error); return error; } if (!memcmp(&leader1, &leader, sizeof(struct leader_record))) break; if (leader.timestamp == LEASE_FREE) break; log_erros(sp, "delta_acquire host_id %llu busy1 %llu %llu %llu %.48s", (unsigned long long)host_id, (unsigned long long)leader.owner_id, (unsigned long long)leader.owner_generation, (unsigned long long)leader.timestamp, leader.resource_name); return SANLK_HOSTID_BUSY; } write_new: new_ts = monotime(); leader.timestamp = new_ts; leader.io_timeout = (sp->io_timeout & 0x00FF); leader.owner_id = host_id; leader.owner_generation++; snprintf(leader.resource_name, NAME_ID_SIZE, "%s", 
our_host_name); leader.checksum = 0; /* set below */ log_space(sp, "delta_acquire write %llu %llu %llu %.48s", (unsigned long long)leader.owner_id, (unsigned long long)leader.owner_generation, (unsigned long long)leader.timestamp, leader.resource_name); leader_record_out(&leader, &leader_end); /* * N.B. must compute checksum after the data has been byte swapped. */ checksum = leader_checksum(&leader_end); leader.checksum = checksum; leader_end.checksum = cpu_to_le32(checksum); rv = write_sector(disk, sp->sector_size, host_id - 1, (char *)&leader_end, sizeof(struct leader_record), task, sp->io_timeout, "delta_leader"); if (rv < 0) { log_space(sp, "delta_acquire write error %d", rv); return rv; } memcpy(&leader1, &leader, sizeof(struct leader_record)); delay = 2 * other_io_timeout; log_space(sp, "delta_acquire delta_short_delay %d", delay); for (i = 0; i < delay; i++) { if (sp->external_remove || external_shutdown) { log_space(sp, "delta_acquire abort2 remove %d shutdown %d", sp->external_remove, external_shutdown); return SANLK_ERROR; } sleep(1); } error = delta_lease_leader_read(task, sp->sector_size, sp->io_timeout, disk, space_name, host_id, &leader, "delta_acquire_check"); if (error < 0) { log_space(sp, "delta_acquire leader_read3 error %d", error); return error; } if (memcmp(&leader1, &leader, sizeof(struct leader_record))) { log_erros(sp, "delta_acquire host_id %llu busy2 %llu %llu %llu %.48s", (unsigned long long)host_id, (unsigned long long)leader.owner_id, (unsigned long long)leader.owner_generation, (unsigned long long)leader.timestamp, leader.resource_name); return SANLK_HOSTID_BUSY; } log_space(sp, "delta_acquire done %llu %llu %llu", (unsigned long long)leader.owner_id, (unsigned long long)leader.owner_generation, (unsigned long long)leader.timestamp); memcpy(leader_ret, &leader, sizeof(struct leader_record)); return SANLK_OK; } int delta_lease_renew(struct task *task, struct space *sp, struct sync_disk *disk, char *space_name, char *bitmap, struct 
delta_extra *extra, int prev_result, int *read_result, int log_renewal_level, struct leader_record *leader_last, struct leader_record *leader_ret, int *rd_ms, int *wr_ms) { struct leader_record leader; struct leader_record leader_end; char **p_iobuf; char **p_wbuf; char *wbuf; struct timespec begin, end, diff; uint32_t checksum; uint32_t reap_timeout_msec; uint64_t host_id, id_offset, new_ts, now; int rv, iobuf_len, sector_size; if (!leader_last) { log_erros(sp, "delta_renew no leader_last"); return -EINVAL; } *rd_ms = -1; *wr_ms = -1; *read_result = SANLK_ERROR; host_id = leader_last->owner_id; iobuf_len = sp->align_size; sector_size = sp->sector_size; /* offset of our leader_record */ id_offset = (host_id - 1) * sector_size; if (id_offset > iobuf_len) { log_erros(sp, "delta_renew bad offset %llu iobuf_len %d", (unsigned long long)id_offset, iobuf_len); return -EINVAL; } /* if the previous renew timed out in this initial read, and that read is now complete, we can use that result here instead of discarding it and doing another. 
*/ if (prev_result == SANLK_AIO_TIMEOUT) { if (!task->read_iobuf_timeout_aicb) { /* shouldn't happen, when do_linux_aio returned AIO_TIMEOUT it should have set read_iobuf_timeout_aicb */ log_erros(sp, "delta_renew reap no aicb"); goto skip_reap; } if (!task->iobuf) { /* shouldn't happen */ log_erros(sp, "delta_renew reap no iobuf"); goto skip_reap; } log_space(sp, "delta_renew begin reap"); if (!sp->renewal_read_extend_sec) { /* only wait .5 sec when trying to reap a prev io to clear it */ reap_timeout_msec = 500; } else { /* effectively continue/extend the read phase from the previous renewal */ reap_timeout_msec = sp->renewal_read_extend_sec * 1000; } clock_gettime(CLOCK_MONOTONIC_RAW, &begin); rv = read_iobuf_reap(disk->fd, disk->offset, task->iobuf, iobuf_len, task, reap_timeout_msec); log_space(sp, "delta_renew reap %d", rv); if (!rv) { /* read time for this renewal is the io_timeout length for the previous read plus the time spent in reap. */ clock_gettime(CLOCK_MONOTONIC_RAW, &end); ts_diff(&begin, &end, &diff); *rd_ms = (diff.tv_sec * 1000) + (diff.tv_nsec / 1000000) + (sp->io_timeout * 1000); task->read_iobuf_timeout_aicb = NULL; goto read_done; } skip_reap: /* abandon the previous timed out read and try a new one from scratch. the current task->iobuf mem will freed when timeout_aicb completes sometime */ task->read_iobuf_timeout_aicb = NULL; task->iobuf = NULL; } if (task->read_iobuf_timeout_aicb) { /* this could happen get here if there was another read between renewal reads, which timed out and caused read_iobuf_timeout_aicb to be set; I don't think there are any cases where that would happen, though. we could avoid this confusion by passing back the timed out aicb along with SANLK_AIO_TIMEOUT, and only save the timed out aicb when we want to try to reap it later. */ log_space(sp, "delta_renew timeout_aicb is unexpectedly %p iobuf %p", task->read_iobuf_timeout_aicb, task->iobuf); task->read_iobuf_timeout_aicb = NULL; task->iobuf = NULL; } /* * NB. 
this task->iobuf is also copied by the lockspace thread * into renewal_read_buf, which is then copied in the main loop * by check_our_lease and passed to check_other_leases. */ if (!task->iobuf) { /* this will happen the first time renew is called, and after a timed out renewal read fails to be reaped (see task->iobuf = NULL above) */ p_iobuf = &task->iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) { log_erros(sp, "dela_renew memalign rv %d", rv); rv = -ENOMEM; } } if (log_renewal_level != -1) log_level(sp->space_id, 0, NULL, log_renewal_level, "delta_renew begin read"); rv = read_iobuf(disk->fd, disk->offset, task->iobuf, iobuf_len, task, sp->io_timeout, rd_ms); if (rv) { /* the next time delta_lease_renew() is called, prev_result will be this rv. If this rv is SANLK_AIO_TIMEOUT, we'll try to reap the event */ if (rv == SANLK_AIO_TIMEOUT) log_erros(sp, "delta_renew read timeout %u sec offset %llu %s", sp->io_timeout, (unsigned long long)disk->offset, disk->path); else log_erros(sp, "delta_renew read rv %d offset %llu %s", rv, (unsigned long long)disk->offset, disk->path); return rv; } read_done: *read_result = SANLK_OK; memcpy(&leader_end, task->iobuf+id_offset, sizeof(struct leader_record)); /* N.B. compute checksum before byte swapping */ checksum = leader_checksum(&leader_end); leader_record_in(&leader_end, &leader); rv = verify_leader(disk, space_name, host_id, &leader, checksum, "delta_renew"); if (rv < 0) { log_erros(sp, "delta_renew verify_leader error %d", rv); return rv; } /* We can't always memcmp(&leader, leader_last) because previous writes may have timed out and we don't know if they were actually written or not. We can definately verify that we're still the owner, though, which is the main thing we need to know. 
*/ if (leader.owner_id != leader_last->owner_id || leader.owner_generation != leader_last->owner_generation || memcmp(leader.resource_name, leader_last->resource_name, NAME_ID_SIZE)) { log_erros(sp, "delta_renew not owner"); log_leader_error(0, space_name, host_id, disk, leader_last, "delta_renew_last"); log_leader_error(0, space_name, host_id, disk, &leader, "delta_renew_read"); return SANLK_RENEW_OWNER; } if (prev_result == SANLK_OK && memcmp(&leader, leader_last, sizeof(struct leader_record))) { log_erros(sp, "delta_renew reread mismatch"); log_leader_error(0, space_name, host_id, disk, leader_last, "delta_renew_last"); log_leader_error(0, space_name, host_id, disk, &leader, "delta_renew_read"); return SANLK_RENEW_DIFF; } if (leader.io_timeout != sp->io_timeout) { log_erros(sp, "delta_renew io_timeout changed disk %d sp %d", leader.io_timeout, sp->io_timeout); leader.io_timeout = (sp->io_timeout & 0x00FF); } new_ts = monotime(); if (log_renewal_level != -1) log_level(sp->space_id, 0, NULL, log_renewal_level, "delta_renew begin write for new ts %llu", (unsigned long long)new_ts); if (leader.timestamp >= new_ts) log_erros(sp, "delta_renew timestamp too small"); leader.timestamp = new_ts; leader.checksum = 0; /* set below */ /* TODO: rename the leader fields */ if (extra) { leader.write_id = extra->field1; leader.write_generation = extra->field2; leader.write_timestamp = extra->field3; } p_wbuf = &wbuf; rv = posix_memalign((void *)p_wbuf, getpagesize(), sector_size); if (rv) { log_erros(sp, "dela_renew write memalign rv %d", rv); return -ENOMEM; } memset(wbuf, 0, sector_size); leader_record_out(&leader, &leader_end); /* * N.B. must compute checksum after the data has been byte swapped. 
*/ checksum = leader_checksum(&leader_end); leader.checksum = checksum; leader_end.checksum = cpu_to_le32(checksum); memcpy(wbuf, &leader_end, sizeof(struct leader_record)); memcpy(wbuf+LEADER_RECORD_MAX, bitmap, HOSTID_BITMAP_SIZE); /* extend io timeout for this one write; we need to give this write every chance to succeed, and there's no point in letting it time out. there's nothing we would do but retry it, and timing out and retrying unnecessarily would probably be counter productive. */ rv = write_iobuf(disk->fd, disk->offset+id_offset, wbuf, sector_size, task, calc_host_dead_seconds(sp->io_timeout), wr_ms); if (rv != SANLK_AIO_TIMEOUT) free(wbuf); now = monotime(); if (rv < 0) { log_erros(sp, "delta_renew write time %llu error %d", (unsigned long long)(now - new_ts), rv); return rv; } if (now - new_ts >= sp->io_timeout) log_erros(sp, "delta_renew long write time %llu sec", (unsigned long long)(now - new_ts)); /* the paper shows doing a delay and another read here, but it seems unnecessary since we do the same at the beginning of the next renewal */ memcpy(leader_ret, &leader, sizeof(struct leader_record)); return SANLK_OK; } int delta_lease_release(struct task *task, struct space *sp, struct sync_disk *disk, char *space_name GNUC_UNUSED, struct leader_record *leader_last, struct leader_record *leader_ret) { struct leader_record leader; struct leader_record leader_end; uint64_t host_id; uint32_t checksum; int rv; if (!leader_last) return -EINVAL; host_id = leader_last->owner_id; log_space(sp, "delta_release begin %.48s:%llu", sp->space_name, (unsigned long long)host_id); memcpy(&leader, leader_last, sizeof(struct leader_record)); leader.timestamp = LEASE_FREE; leader.checksum = 0; /* set below */ leader_record_out(&leader, &leader_end); /* * N.B. must compute checksum after the data has been byte swapped. 
*/ checksum = leader_checksum(&leader_end); leader.checksum = checksum; leader_end.checksum = cpu_to_le32(checksum); rv = write_sector(disk, sp->sector_size, host_id - 1, (char *)&leader_end, sizeof(struct leader_record), task, sp->io_timeout, "delta_leader"); if (rv < 0) { log_space(sp, "delta_release write error %d", rv); return rv; } log_space(sp, "delta_release done %llu %llu %llu", (unsigned long long)leader.owner_id, (unsigned long long)leader.owner_generation, (unsigned long long)leader.timestamp); memcpy(leader_ret, &leader, sizeof(struct leader_record)); return SANLK_OK; } /* the host_id lease area begins disk->offset bytes from the start of block device disk->path */ int delta_lease_init(struct task *task, struct sanlk_lockspace *ls, int io_timeout, struct sync_disk *disk) { struct leader_record leader_first; struct leader_record leader_end; struct leader_record leader; char *iobuf, **p_iobuf; int iobuf_len; int sector_size = 0; int align_size = 0; int max_hosts = 0; int write_io_timeout; int i, rv; uint32_t checksum; if (!io_timeout) io_timeout = DEFAULT_IO_TIMEOUT; rv = sizes_from_flags(ls->flags, §or_size, &align_size, &max_hosts, "LSF"); if (rv) return rv; if (!sector_size) { /* sector/align flags were not set, use historical defaults */ sector_size = disk->sector_size; align_size = sector_size_to_align_size_old(sector_size); max_hosts = DEFAULT_MAX_HOSTS; } iobuf_len = align_size; p_iobuf = &iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) return rv; memset(iobuf, 0, iobuf_len); /* host_id N is block offset N-1 */ for (i = 0; i < max_hosts; i++) { memset(&leader, 0, sizeof(struct leader_record)); leader.magic = DELTA_DISK_MAGIC; leader.version = DELTA_DISK_VERSION_MAJOR | DELTA_DISK_VERSION_MINOR; leader.flags = leader_align_flag_from_size(align_size); leader.sector_size = sector_size; leader.max_hosts = 1; leader.timestamp = LEASE_FREE; leader.io_timeout = io_timeout; strncpy(leader.space_name, ls->name, NAME_ID_SIZE); 
leader.checksum = 0; /* set below */ /* make the first record invalid so we can do a single atomic write below to commit the whole thing */ if (!i) { leader.magic = 0; memcpy(&leader_first, &leader, sizeof(struct leader_record)); } leader_record_out(&leader, &leader_end); /* * N.B. must compute checksum after the data has been byte swapped. */ checksum = leader_checksum(&leader_end); leader.checksum = checksum; leader_end.checksum = cpu_to_le32(checksum); memcpy(iobuf + (i * sector_size), &leader_end, sizeof(struct leader_record)); } /* * The io_timeout arg is a part of the lockspace logic, and * determines how the lockspace times out. The process of * initializing the lease on disk can to use a longer timeout * than the algorithm uses. */ if (com.write_init_io_timeout) write_io_timeout = com.write_init_io_timeout; else write_io_timeout = io_timeout; rv = write_iobuf(disk->fd, disk->offset, iobuf, iobuf_len, task, write_io_timeout, NULL); if (rv < 0) goto out; /* commit the whole lockspace by making the first record valid */ leader_first.magic = DELTA_DISK_MAGIC; leader_first.checksum = 0; /* set below */ leader_record_out(&leader_first, &leader_end); /* * N.B. must compute checksum after the data has been byte swapped. */ checksum = leader_checksum(&leader_end); leader_first.checksum = checksum; leader_end.checksum = cpu_to_le32(checksum); memcpy(iobuf, &leader_end, sizeof(struct leader_record)); rv = write_iobuf(disk->fd, disk->offset, iobuf, sector_size, task, write_io_timeout, NULL); out: if (rv != SANLK_AIO_TIMEOUT) free(iobuf); return rv; } sanlock-3.8.2/src/delta_lease.h000066400000000000000000000047611371427612200163750ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #ifndef __DELTA_LEASE_H__ #define __DELTA_LEASE_H__ int delta_lease_leader_read(struct task *task, int sector_size, int io_timeout, struct sync_disk *disk, char *space_name, uint64_t host_id, struct leader_record *leader_ret, const char *caller); int delta_lease_acquire(struct task *task, struct space *sp, struct sync_disk *disk, char *space_name, char *our_host_name, uint64_t host_id, struct leader_record *leader_ret); int delta_lease_renew(struct task *task, struct space *sp, struct sync_disk *disk, char *space_name, char *bitmap, struct delta_extra *extra, int prev_result, int *read_result, int log_renewal_level, struct leader_record *leader_last, struct leader_record *leader_ret, int *rd_ms, int *wr_ms); int delta_lease_release(struct task *task, struct space *sp, struct sync_disk *disk, char *space_name GNUC_UNUSED, struct leader_record *leader_last, struct leader_record *leader_ret); int delta_lease_init(struct task *task, struct sanlk_lockspace *ls, int io_timeout, struct sync_disk *disk); int delta_read_lockspace(struct task *task, struct sync_disk *disk, int sector_size_hint, int align_size_hint, uint64_t host_id, struct sanlk_lockspace *ls, int io_timeout, int *io_timeout_ret); int delta_read_lockspace_sizes(struct task *task, struct sync_disk *disk, int io_timeout, int *sector_size, int *align_size); int delta_lease_leader_clobber(struct task *task, int io_timeout, struct sync_disk *disk, uint64_t host_id, struct leader_record *leader, const char *caller); #endif sanlock-3.8.2/src/direct.c000066400000000000000000000565171371427612200154060ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "sanlock_admin.h" #include "diskio.h" #include "ondisk.h" #include "log.h" #include "resource.h" #include "direct.h" #include "paxos_lease.h" #include "delta_lease.h" #include "timeouts.h" #include "rindex.h"

/*
 * Read one 4096-byte sector at sd->offset (set by the caller, in bytes
 * from the start of the disk) and, if it holds a delta or paxos leader
 * record, report the sector and align sizes recorded in it.
 *
 * Returns 0 and fills *sector_size / *align_size on success, -ENOMEM if
 * the read buffer cannot be allocated, the read_sectors() error if the
 * read fails, or -1 if the sector carries neither magic number.
 */
static int direct_read_leader_sizes(struct task *task,
				    struct sync_disk *sd,
				    int *sector_size,
				    int *align_size)
{
	const int buflen = 4096;
	struct leader_record rec;
	char *buf;
	int rv;

	/* zero-filled buffer, as the leader record is decoded from it */
	buf = calloc(1, buflen);
	if (!buf)
		return -ENOMEM;

	rv = read_sectors(sd, 4096, 0, 1, buf, buflen, task, DEFAULT_IO_TIMEOUT, "read_sector_size");
	if (rv < 0) {
		free(buf);
		return rv;
	}

	/* convert the on-disk record to host byte order, then drop the buffer */
	leader_record_in((struct leader_record *)buf, &rec);
	free(buf);

	if (rec.magic != DELTA_DISK_MAGIC && rec.magic != PAXOS_DISK_MAGIC)
		return -1;

	*sector_size = rec.sector_size;
	*align_size = leader_align_size_from_flag(rec.flags);

	/* no align flag recorded: derive it from the sector size */
	if (!*align_size)
		*align_size = sector_size_to_align_size_old(*sector_size);

	return 0;
}

/* * cli: sanlock direct init * cli: sanlock direct read_leader * cli: sanlock direct acquire * cli: sanlock direct release * lib: sanlock_direct_init() * * direct.c: * direct_init() * direct_read_leader() * direct_acquire() * direct_release() * do_paxos_action() * paxos_lease.c: * paxos_lease_init() * paxos_lease_leader_read() * paxos_lease_acquire() * paxos_lease_release() * * cli: sanlock direct init * cli: sanlock direct read_leader * cli: sanlock direct acquire_id * cli: sanlock direct release_id * cli: sanlock direct renew_id * lib: sanlock_direct_init() * * direct.c: * direct_init() * direct_read_leader() * direct_acquire_id() * direct_release_id() * 
direct_renew_id() * do_delta_action() * delta_lease.c: * delta_lease_init() * delta_lease_leader_read() * delta_lease_acquire() * delta_lease_release() * delta_lease_renew() */ static int do_paxos_action(int action, struct task *task, int io_timeout, struct sanlk_resource *res, int num_hosts, int write_clear, uint64_t local_host_id, uint64_t local_host_generation, struct leader_record *leader_in, struct leader_record *leader_ret) { struct token *token; struct leader_record leader; struct paxos_dblock dblock; int sector_size = 0; int align_size = 0; int max_hosts = 0; int disks_len, token_len; int j, rv = 0; if (!io_timeout) io_timeout = DEFAULT_IO_TIMEOUT; rv = sizes_from_flags(res->flags, §or_size, &align_size, &max_hosts, "RES"); if (rv) return -1; disks_len = res->num_disks * sizeof(struct sync_disk); token_len = sizeof(struct token) + disks_len; token = malloc(token_len); if (!token) return -ENOMEM; memset(token, 0, token_len); token->io_timeout = io_timeout; token->disks = (struct sync_disk *)&token->r.disks[0]; token->r.num_disks = res->num_disks; memcpy(token->r.lockspace_name, res->lockspace_name, SANLK_NAME_LEN); memcpy(token->r.name, res->name, SANLK_NAME_LEN); token->r.flags = res->flags; /* WARNING sync_disk == sanlk_disk */ memcpy(token->disks, &res->disks, disks_len); for (j = 0; j < token->r.num_disks; j++) { token->disks[j].sector_size = 0; token->disks[j].fd = -1; } rv = open_disks(token->disks, token->r.num_disks); if (rv < 0) { free(token); return rv; } if (!sector_size && com.sector_size) sector_size = com.sector_size; if (!align_size && com.align_size) align_size = com.align_size; switch (action) { case ACT_DIRECT_INIT: /* paxos_lease_init looks at token->r.flags for sector/align flags */ rv = paxos_lease_init(task, token, num_hosts, write_clear); break; case ACT_ACQUIRE: if (!sector_size || !align_size) { rv = direct_read_leader_sizes(task, &token->disks[0], §or_size, &align_size); if (rv < 0) break; } token->sector_size = sector_size; 
token->align_size = align_size; token->host_id = local_host_id; token->host_generation = local_host_generation; rv = paxos_lease_acquire(task, token, 0, leader_ret, &dblock, 0, 0); break; case ACT_RELEASE: if (!sector_size) sector_size = 4096; if (!align_size) align_size = sector_size_to_align_size_old(sector_size); token->sector_size = sector_size; token->align_size = align_size; rv = paxos_lease_leader_read(task, token, &leader, "direct_release"); if (rv < 0) break; sector_size = leader.sector_size; align_size = leader_align_size_from_flag(leader.flags); if (!align_size) align_size = sector_size_to_align_size_old(sector_size); token->sector_size = sector_size; token->align_size = align_size; rv = paxos_lease_release(task, token, NULL, &leader, leader_ret); break; case ACT_READ_LEADER: if (!sector_size) sector_size = 4096; if (!align_size) align_size = sector_size_to_align_size_old(sector_size); token->sector_size = sector_size; token->align_size = align_size; rv = paxos_lease_leader_read(task, token, &leader, "direct_read_leader"); break; case ACT_WRITE_LEADER: sector_size = leader_in->sector_size; align_size = leader_align_size_from_flag(leader_in->flags); if (!align_size) align_size = sector_size_to_align_size_old(sector_size); token->sector_size = sector_size; token->align_size = align_size; rv = paxos_lease_leader_clobber(task, token, leader_in, "direct_clobber"); break; } close_disks(token->disks, token->r.num_disks); free(token); if (rv == SANLK_OK) rv = 0; if (leader_ret) memcpy(leader_ret, &leader, sizeof(struct leader_record)); return rv; } /* * sanlock direct acquire -i -g -r RESOURCE * sanlock direct release -r RESOURCE */ int direct_acquire(struct task *task, int io_timeout, struct sanlk_resource *res, int num_hosts, uint64_t local_host_id, uint64_t local_host_generation, struct leader_record *leader_ret) { return do_paxos_action(ACT_ACQUIRE, task, io_timeout, res, num_hosts, 0, local_host_id, local_host_generation, NULL, leader_ret); } int 
direct_release(struct task *task, int io_timeout, struct sanlk_resource *res, struct leader_record *leader_ret) { return do_paxos_action(ACT_RELEASE, task, io_timeout, res, 0, 0, 0, 0, NULL, leader_ret); } static int do_delta_action(int action, struct task *task, int io_timeout, struct sanlk_lockspace *ls, char *our_host_name, struct leader_record *leader_in, struct leader_record *leader_ret) { struct leader_record leader; struct sync_disk sd; struct space space; char bitmap[HOSTID_BITMAP_SIZE]; int sector_size = 0; int align_size = 0; int max_hosts = 0; int read_result; int rd_ms, wr_ms; int rv; memset(bitmap, 0, sizeof(bitmap)); if (!io_timeout) io_timeout = DEFAULT_IO_TIMEOUT; rv = sizes_from_flags(ls->flags, §or_size, &align_size, &max_hosts, "LSF"); if (rv) return -1; memset(&leader, 0, sizeof(leader)); /* for log_space in delta functions */ memset(&space, 0, sizeof(space)); space.io_timeout = io_timeout; if (!ls->host_id_disk.path[0]) return -ENODEV; if ((action != ACT_DIRECT_INIT) && !ls->host_id) return -EINVAL; memset(&sd, 0, sizeof(struct sync_disk)); memcpy(&sd, &ls->host_id_disk, sizeof(struct sanlk_disk)); sd.fd = -1; rv = open_disk(&sd); if (rv < 0) return -ENODEV; if (!sector_size && com.sector_size) sector_size = com.sector_size; if (!align_size && com.align_size) align_size = com.align_size; switch (action) { case ACT_DIRECT_INIT: /* delta_lease_init looks at ls->flags for sector/align sizes */ rv = delta_lease_init(task, ls, io_timeout, &sd); break; case ACT_ACQUIRE_ID: if (!sector_size || !align_size) { rv = direct_read_leader_sizes(task, &sd, §or_size, &align_size); if (rv < 0) break; } space.sector_size = sector_size; space.align_size = align_size; rv = delta_lease_acquire(task, &space, &sd, ls->name, our_host_name, ls->host_id, &leader); break; case ACT_RENEW_ID: if (!sector_size || !align_size) { rv = direct_read_leader_sizes(task, &sd, §or_size, &align_size); if (rv < 0) break; } space.sector_size = sector_size; space.align_size = 
align_size; rv = delta_lease_leader_read(task, sector_size, io_timeout, &sd, ls->name, ls->host_id, &leader, "direct_renew"); if (rv < 0) return rv; rv = delta_lease_renew(task, &space, &sd, ls->name, bitmap, NULL, -1, &read_result, 0, &leader, &leader, &rd_ms, &wr_ms); break; case ACT_RELEASE_ID: if (!sector_size || !align_size) { rv = direct_read_leader_sizes(task, &sd, §or_size, &align_size); if (rv < 0) break; } space.sector_size = sector_size; space.align_size = align_size; rv = delta_lease_leader_read(task, sector_size, io_timeout, &sd, ls->name, ls->host_id, &leader, "direct_release"); if (rv < 0) return rv; rv = delta_lease_release(task, &space, &sd, ls->name, &leader, &leader); break; case ACT_READ_LEADER: if (!sector_size || !align_size) { rv = direct_read_leader_sizes(task, &sd, §or_size, &align_size); if (rv < 0) break; } rv = delta_lease_leader_read(task, sector_size, io_timeout, &sd, ls->name, ls->host_id, &leader, "direct_read"); break; case ACT_WRITE_LEADER: rv = delta_lease_leader_clobber(task, io_timeout, &sd, ls->host_id, leader_in, "direct_clobber"); } close_disks(&sd, 1); if (rv == SANLK_OK) rv = 0; if (leader_ret) memcpy(leader_ret, &leader, sizeof(struct leader_record)); return rv; } /* * sanlock direct acquire_id|release_id|renew_id -s LOCKSPACE * * should be the equivalent of what the daemon would do for * sanlock client add_lockspace|rem_lockspace -s LOCKSPACE */ int direct_acquire_id(struct task *task, int io_timeout, struct sanlk_lockspace *ls, char *our_host_name) { return do_delta_action(ACT_ACQUIRE_ID, task, io_timeout, ls, our_host_name, NULL, NULL); } int direct_release_id(struct task *task, int io_timeout, struct sanlk_lockspace *ls) { return do_delta_action(ACT_RELEASE_ID, task, io_timeout, ls, NULL, NULL, NULL); } int direct_renew_id(struct task *task, int io_timeout, struct sanlk_lockspace *ls) { return do_delta_action(ACT_RENEW_ID, task, io_timeout, ls, NULL, NULL, NULL); } int direct_align(struct sync_disk *disk) { if 
(disk->sector_size == 512) return 1024 * 1024; else if (disk->sector_size == 4096) return 8 * 1024 * 1024; else return -EINVAL; } /* io_timeout is written to leader record and used for the write call itself */ int direct_write_lockspace(struct task *task, struct sanlk_lockspace *ls, uint32_t io_timeout) { if (!ls) return -1; return do_delta_action(ACT_DIRECT_INIT, task, io_timeout, ls, NULL, NULL, NULL); } int direct_write_resource(struct task *task, struct sanlk_resource *res, int num_hosts, int write_clear) { if (!res) return -1; if (!res->num_disks) return -ENODEV; if (!res->disks[0].path[0]) return -ENODEV; return do_paxos_action(ACT_DIRECT_INIT, task, 0, res, num_hosts, write_clear, 0, 0, NULL, NULL); } int direct_read_leader(struct task *task, int io_timeout, struct sanlk_lockspace *ls, struct sanlk_resource *res, struct leader_record *leader_ret) { int rv = -1; if (ls && ls->host_id_disk.path[0]) rv = do_delta_action(ACT_READ_LEADER, task, io_timeout, ls, NULL, NULL, leader_ret); else if (res) rv = do_paxos_action(ACT_READ_LEADER, task, io_timeout, res, 0, 0, 0, 0, NULL, leader_ret); return rv; } int direct_write_leader(struct task *task, int io_timeout, struct sanlk_lockspace *ls, struct sanlk_resource *res, struct leader_record *leader) { int rv = -1; if (ls && ls->host_id_disk.path[0]) { rv = do_delta_action(ACT_WRITE_LEADER, task, io_timeout, ls, NULL, leader, NULL); } else if (res) { rv = do_paxos_action(ACT_WRITE_LEADER, task, io_timeout, res, 0, 0, 0, 0, leader, NULL); } return rv; } int test_id_bit(int host_id, char *bitmap); int direct_dump(struct task *task, char *dump_path, int force_mode) { char *data, *bitmap; char *colon1 = NULL, *colon2 = NULL, *off_str = NULL, *size_str = NULL, *m; uint32_t magic; struct rindex_header *rh_end; struct rindex_header *rh; struct rindex_header rh_in; struct rindex_entry *re_end; struct rindex_entry *re; struct rindex_entry re_in; struct leader_record *lr_end; struct leader_record *lr; struct leader_record lr_in; 
struct request_record rr; struct mode_block mb; struct sync_disk sd; struct paxos_dblock dblock; char sname[NAME_ID_SIZE+1]; char rname[NAME_ID_SIZE+1]; uint64_t sector_nr; uint64_t start_offset = 0; uint64_t dump_size = 0; uint64_t end_sector_nr = 0; int sector_size = 0; int align_size = 0; int sector_count, datalen, max_hosts; int i, j, rv, b; memset(&sd, 0, sizeof(struct sync_disk)); /* * /path[:[:]] * * If path contains a colon, the user would escape it with \\, e.g. * device named /dev/foo:32 using offset 0 and lenth 1M would be * /dev/foo\\:32:0:1M */ for (i = 0; i < strlen(dump_path); i++) { if (dump_path[i] == '\\') { i++; continue; } if (dump_path[i] == ':') { if (!colon1) colon1 = &dump_path[i]; else if (!colon2) colon2 = &dump_path[i]; } } if (colon1) { *colon1 = '\0'; off_str = colon1 + 1; if (colon2) { *colon2 = '\0'; size_str = colon2 + 1; } if ((m = strchr(off_str, 'M'))) { *m = '\0'; start_offset = atoll(off_str) * 1024 * 1024; } else { start_offset = atoll(off_str); } if (size_str) { if ((m = strchr(size_str, 'M'))) { *m = '\0'; dump_size = atoll(size_str) * 1024 * 1024; } else { dump_size = atoll(size_str); } } } if (start_offset % 1048576) printf("WARNING: dump offset should be a multiple of 1048576 bytes.\n"); sanlock_path_import(sd.path, dump_path, sizeof(sd.path)); sd.fd = -1; rv = open_disk(&sd); if (rv < 0) { printf("Device %s not found.\n", sd.path); return -ENODEV; } if (com.sector_size) sector_size = com.sector_size; if (com.align_size) align_size = com.align_size; if (!sector_size || !align_size) { sd.offset = start_offset; for (i = 0; i < 1024; i++) { rv = direct_read_leader_sizes(task, &sd, §or_size, &align_size); if (sector_size && align_size) break; /* * search for a data structure that contains sector_size/align_size * every 1MB, up to 1GB or dump_size. 
*/ sd.offset += 1048576; if (dump_size && (sd.offset >= (start_offset + dump_size))) break; } if (!sector_size || !align_size) { printf("Cannot find sector_size and align_size, set with -A and -Z.\n"); goto out_close; } } max_hosts = size_to_max_hosts(sector_size, align_size); sector_count = align_size / sector_size; datalen = align_size; data = malloc(datalen); if (!data) { rv = -ENOMEM; goto out_close; } memset(data, 0, datalen); printf("%8s %36s %48s %10s %4s %4s %s", "offset", "lockspace", "resource", "timestamp", "own", "gen", "lver"); if (force_mode) printf("/req/mode"); printf("\n"); sd.offset = start_offset; sector_nr = 0; if (dump_size) end_sector_nr = dump_size / sector_size; while (end_sector_nr == 0 || sector_nr < end_sector_nr) { memset(sname, 0, sizeof(rname)); memset(rname, 0, sizeof(rname)); memset(data, 0, sector_size); rv = read_sectors(&sd, sector_size, sector_nr, sector_count, data, datalen, task, DEFAULT_IO_TIMEOUT, "dump"); magic_in(data, &magic); if (magic == DELTA_DISK_MAGIC) { lr_end = (struct leader_record *)data; leader_record_in(lr_end, &lr_in); lr = &lr_in; for (i = 0; i < sector_count; i++) { lr_end = (struct leader_record *)(data + (i * sector_size)); if (!lr_end->magic) continue; leader_record_in(lr_end, &lr_in); lr = &lr_in; /* has never been acquired, don't print */ if (!lr->owner_id && !lr->owner_generation) continue; strncpy(sname, lr->space_name, NAME_ID_SIZE); strncpy(rname, lr->resource_name, NAME_ID_SIZE); printf("%08llu %36s %48s %010llu %04llu %04llu", (unsigned long long)(start_offset + ((sector_nr + i) * sector_size)), sname, rname, (unsigned long long)lr->timestamp, (unsigned long long)lr->owner_id, (unsigned long long)lr->owner_generation); if (force_mode) { bitmap = (char *)lr_end + LEADER_RECORD_MAX; for (b = 0; b < max_hosts; b++) { if (test_id_bit(b+1, bitmap)) printf(" %d", b+1); } } printf("\n"); } } else if (magic == PAXOS_DISK_MAGIC) { lr_end = (struct leader_record *)data; leader_record_in(lr_end, &lr_in); lr = 
&lr_in; strncpy(sname, lr->space_name, NAME_ID_SIZE); strncpy(rname, lr->resource_name, NAME_ID_SIZE); printf("%08llu %36s %48s %010llu %04llu %04llu %llu", (unsigned long long)(start_offset + (sector_nr * sector_size)), sname, rname, (unsigned long long)lr->timestamp, (unsigned long long)lr->owner_id, (unsigned long long)lr->owner_generation, (unsigned long long)lr->lver); if (force_mode) { struct request_record *rr_end = (struct request_record *)(data + sector_size); request_record_in(rr_end, &rr); printf("/%llu/%u", (unsigned long long)rr.lver, rr.force_mode); } printf("\n"); for (i = 0; i < lr->num_hosts; i++) { char *pd_end = data + ((2 + i) * sector_size); struct mode_block *mb_end = (struct mode_block *)(pd_end + MBLOCK_OFFSET); if (force_mode > 1) { paxos_dblock_in((struct paxos_dblock *)pd_end, &dblock); if (dblock.mbal || dblock.inp || dblock.lver) { printf("dblock[%04d] mbal %llu bal %llu inp %llu inp2 %llu inp3 %llu lver %llu sum %x\n", i, (unsigned long long)dblock.mbal, (unsigned long long)dblock.bal, (unsigned long long)dblock.inp, (unsigned long long)dblock.inp2, (unsigned long long)dblock.inp3, (unsigned long long)dblock.lver, dblock.checksum); } } mode_block_in(mb_end, &mb); if (!(mb.flags & MBLOCK_SHARED)) continue; printf(" "); printf("%04u %04llu SH\n", i+1, (unsigned long long)mb.generation); } } else if (magic == RINDEX_DISK_MAGIC) { rh_end = (struct rindex_header *)data; rindex_header_in(rh_end, &rh_in); rh = &rh_in; strncpy(sname, rh->lockspace_name, NAME_ID_SIZE); printf("%08llu %36s rindex_header 0x%x %d %u %llu\n", (unsigned long long)(start_offset + (sector_nr * sector_size)), sname, rh->flags, rh->sector_size, rh->max_resources, (unsigned long long)rh->rx_offset); if (!force_mode) goto next; /* i begins with 1 to skip the first sector of the rindex which holds the header */ for (i = 1; i < sector_count; i++) { int entry_size = sizeof(struct rindex_entry); int entries_per_sector = sector_size / entry_size; for (j = 0; j < 
entries_per_sector; j++) { re_end = (struct rindex_entry *)(data + (i * sector_size) + (j * entry_size)); rindex_entry_in(re_end, &re_in); re = &re_in; if (!re->res_offset && !re->name[0]) continue; printf("%08llu %36s rentry %s %llu\n", (unsigned long long)(start_offset + ((sector_nr * sector_size) + (i * sector_size) + (j * entry_size))), sname, re->name, (unsigned long long)re->res_offset); } } } else { if (end_sector_nr == 0) break; } next: sector_nr += sector_count; } rv = 0; free(data); out_close: close_disks(&sd, 1); return rv; } int direct_next_free(struct task *task, char *path) { char *data; char *colon, *off_str; struct leader_record *lr_end; struct leader_record lr; struct sync_disk sd; uint64_t sector_nr; int sector_size, sector_count, datalen, align_size; int rv; memset(&sd, 0, sizeof(struct sync_disk)); colon = strstr(path, ":"); if (colon) { off_str = colon + 1; *colon = '\0'; sd.offset = atoll(off_str); } strncpy(sd.path, path, SANLK_PATH_LEN); sd.fd = -1; rv = open_disk(&sd); if (rv < 0) return -ENODEV; if (com.sector_size) sector_size = com.sector_size; if (com.align_size) align_size = com.align_size; if (!sector_size || !align_size) { rv = direct_read_leader_sizes(task, &sd, §or_size, &align_size); if (rv < 0) return rv; } sector_count = align_size / sector_size; datalen = sector_size; data = malloc(datalen); if (!data) { rv = -ENOMEM; goto out_close; } sector_nr = 0; rv = -ENOSPC; while (1) { memset(data, 0, sector_size); rv = read_sectors(&sd, sector_size, sector_nr, 1, data, datalen, task, DEFAULT_IO_TIMEOUT, "next_free"); lr_end = (struct leader_record *)data; leader_record_in(lr_end, &lr); if (lr.magic != DELTA_DISK_MAGIC && lr.magic != PAXOS_DISK_MAGIC && lr.magic != RINDEX_DISK_MAGIC) { printf("%llu\n", (unsigned long long)(sector_nr * sector_size)); rv = 0; goto out_free; } sector_nr += sector_count; } out_free: free(data); out_close: close_disks(&sd, 1); return rv; } int direct_rindex_format(struct task *task, struct sanlk_rindex *ri) { 
return rindex_format(task, ri); } int direct_rindex_rebuild(struct task *task, struct sanlk_rindex *ri, uint32_t cmd_flags) { return rindex_rebuild(task, ri, cmd_flags | SANLK_RX_NO_LOCKSPACE); } int direct_rindex_lookup(struct task *task, struct sanlk_rindex *ri, struct sanlk_rentry *re, uint32_t cmd_flags) { struct sanlk_rentry re_ret; int rv; rv = rindex_lookup(task, ri, re, &re_ret, cmd_flags | SANLK_RX_NO_LOCKSPACE); if (!rv) memcpy(re, &re_ret, sizeof(re_ret)); return rv; } int direct_rindex_update(struct task *task, struct sanlk_rindex *ri, struct sanlk_rentry *re, uint32_t cmd_flags) { struct sanlk_rentry re_ret; int rv; rv = rindex_update(task, ri, re, &re_ret, cmd_flags | SANLK_RX_NO_LOCKSPACE); if (!rv) memcpy(re, &re_ret, sizeof(re_ret)); return rv; } sanlock-3.8.2/src/direct.h000066400000000000000000000046221371427612200154010ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #ifndef __DIRECT_H__ #define __DIRECT_H__ int direct_acquire(struct task *task, int io_timeout, struct sanlk_resource *res, int num_hosts, uint64_t local_host_id, uint64_t local_host_generation, struct leader_record *leader_ret); int direct_release(struct task *task, int io_timeout, struct sanlk_resource *res, struct leader_record *leader_ret); int direct_acquire_id(struct task *task, int io_timeout, struct sanlk_lockspace *ls, char *our_host_name); int direct_release_id(struct task *task, int io_timeout, struct sanlk_lockspace *ls); int direct_renew_id(struct task *task, int io_timeout, struct sanlk_lockspace *ls); int direct_align(struct sync_disk *disk); /* io_timeout is written in the leader record and used for the write call itself */ int direct_write_lockspace(struct task *task, struct sanlk_lockspace *ls, uint32_t io_timeout); int direct_write_resource(struct task *task, struct sanlk_resource *res, int num_hosts, int write_clear); int direct_read_leader(struct task *task, int io_timeout, struct sanlk_lockspace *ls, struct sanlk_resource *res, struct leader_record *leader_ret); int direct_write_leader(struct task *task, int io_timeout, struct sanlk_lockspace *ls, struct sanlk_resource *res, struct leader_record *leader); int direct_dump(struct task *task, char *dump_path, int force_mode); int direct_next_free(struct task *task, char *path); int direct_rindex_format(struct task *task, struct sanlk_rindex *ri); int direct_rindex_rebuild(struct task *task, struct sanlk_rindex *ri, uint32_t cmd_flags); int direct_rindex_lookup(struct task *task, struct sanlk_rindex *ri, struct sanlk_rentry *re, uint32_t cmd_flags); int direct_rindex_update(struct task *task, struct sanlk_rindex *ri, struct sanlk_rentry *re, uint32_t cmd_flags); #endif sanlock-3.8.2/src/direct_lib.c000066400000000000000000000074671371427612200162340ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. 
* * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #define EXTERN #include "sanlock_internal.h" #include "sanlock_direct.h" #include "sanlock_admin.h" #include "diskio.h" #include "direct.h" #include "task.h" #include "timeouts.h"

/*
 * The functions below down to check_mode_block() are do-nothing
 * stand-ins defined in this file: the logging stub has an empty body
 * and the lockspace/host queries always return -1.
 */

void log_level(uint32_t space_id GNUC_UNUSED, uint32_t res_id GNUC_UNUSED, char *name GNUC_UNUSED, int level GNUC_UNUSED, const char *fmt GNUC_UNUSED, ...);

/* logging stub: discards everything */
void log_level(uint32_t space_id GNUC_UNUSED, uint32_t res_id GNUC_UNUSED,
	       char *name GNUC_UNUSED, int level GNUC_UNUSED,
	       const char *fmt GNUC_UNUSED, ...)
{
}

int lockspace_begin_rindex_op(char *space_name GNUC_UNUSED, int rindex_op GNUC_UNUSED, struct space_info *spi GNUC_UNUSED);

/* stub: always fails with -1 */
int lockspace_begin_rindex_op(char *space_name GNUC_UNUSED,
			      int rindex_op GNUC_UNUSED,
			      struct space_info *spi GNUC_UNUSED)
{
	return -1;
}

int lockspace_clear_rindex_op(char *space_name GNUC_UNUSED);

/* stub: always fails with -1 */
int lockspace_clear_rindex_op(char *space_name GNUC_UNUSED)
{
	return -1;
}

int lockspace_disk(char *space_name GNUC_UNUSED, struct sync_disk *disk GNUC_UNUSED);

/* stub: always fails with -1 */
int lockspace_disk(char *space_name GNUC_UNUSED,
		   struct sync_disk *disk GNUC_UNUSED)
{
	return -1;
}

int host_info(char *space_name, uint64_t host_id, struct host_status *hs_out);

/* stub: always fails with -1 */
int host_info(char *space_name GNUC_UNUSED, uint64_t host_id GNUC_UNUSED,
	      struct host_status *hs_out GNUC_UNUSED)
{
	return -1;
}

struct token;

void check_mode_block(struct token *token GNUC_UNUSED, int q GNUC_UNUSED, char *dblock GNUC_UNUSED);

/* stub: does nothing */
void check_mode_block(struct token *token GNUC_UNUSED, int q GNUC_UNUSED,
		      char *dblock GNUC_UNUSED)
{
}

/* copied from host_id.c */

int test_id_bit(int host_id, char *bitmap);

/*
 * Test the bit for host_id in bitmap.  Host ids are 1-based: host_id 1
 * is bit 0 of the first byte.  Returns non-zero when the bit is set.
 */
int test_id_bit(int host_id, char *bitmap)
{
	const int index = host_id - 1;
	char *byte = bitmap + (index / 8);
	unsigned int bit = index % 8;
	char mask = 1 << bit;

	return (*byte & mask);
}

int get_rand(int a, int b);

/* Return a + a random() value scaled by (b - a + 1) / (RAND_MAX + 1). */
int get_rand(int a, int b)
{
	return a + (int) (((float)(b - a + 1)) * random() / (RAND_MAX+1.0));
}

/* Zero a task, set up its aio state and name it "lib". */
static void setup_task_lib(struct task *task, int use_aio)
{
	memset(task, 0, sizeof(struct task));
	setup_task_aio(task, use_aio, LIB_AIO_CB_SIZE);
	sprintf(task->name, "%s", "lib");
}

/*
 * Initialize a lockspace on disk using a temporary aio task.
 * max_hosts_unused and flags are ignored.
 */
int sanlock_direct_write_lockspace(struct sanlk_lockspace *ls, int max_hosts_unused,
				   uint32_t flags GNUC_UNUSED, uint32_t io_timeout)
{
	struct task task;
	int result;

	setup_task_lib(&task, 1);

	result = direct_write_lockspace(&task, ls, io_timeout);

	close_task_aio(&task);

	return result;
}

/*
 * Initialize a resource lease on disk using a temporary aio task.
 * Only the SANLK_WRITE_CLEAR bit of flags is looked at; it is passed on
 * as write_clear.  max_hosts_unused is ignored.
 */
int sanlock_direct_write_resource(struct sanlk_resource *res,
				  int max_hosts_unused, int num_hosts,
				  uint32_t flags)
{
	struct task task;
	int write_clear = 0;
	int result;

	if (flags & SANLK_WRITE_CLEAR)
		write_clear = 1;

	setup_task_lib(&task, 1);

	result = direct_write_resource(&task, res, num_hosts, write_clear);

	close_task_aio(&task);

	return result;
}

/*
 * Initialize a lockspace when ls is given, otherwise a resource.
 * The lockspace is written with io_timeout 0 and the resource with
 * write_clear 0.  max_hosts_unused is ignored.
 */
int sanlock_direct_init(struct sanlk_lockspace *ls,
			struct sanlk_resource *res,
			int max_hosts_unused, int num_hosts, int use_aio)
{
	struct task task;
	int result;

	setup_task_lib(&task, use_aio);

	if (!ls)
		result = direct_write_resource(&task, res, num_hosts, 0);
	else
		result = direct_write_lockspace(&task, ls, 0);

	close_task_aio(&task);

	return result;
}

/*
 * Open the disk named by disk_in and return the direct_align() value
 * for it, or the open_disk() error if it cannot be opened.
 */
int sanlock_direct_align(struct sanlk_disk *disk_in)
{
	struct sync_disk disk;
	int result;

	memset(&disk, 0, sizeof(disk));
	memcpy(disk.path, disk_in->path, SANLK_PATH_LEN);

	result = open_disk(&disk);
	if (result < 0)
		return result;

	result = direct_align(&disk);

	close(disk.fd);

	return result;
}
sanlock-3.8.2/src/diskio.c000066400000000000000000000571711371427612200154130ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version.
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* posix aio */ #include /* posix aio */ #include "sanlock_internal.h" #include "diskio.h" #include "direct.h" #include "log.h" int read_sysfs_size(const char *disk_path, const char *name, unsigned int *val) { char path[PATH_MAX]; char buf[32]; struct stat st; int major, minor; size_t len; int fd; int rv = -1; rv = stat(disk_path, &st); if (rv < 0) return -1; major = (int)major(st.st_rdev); minor = (int)minor(st.st_rdev); snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/queue/%s", major, minor, name); fd = open(path, O_RDONLY, 0); if (fd < 0) return -1; rv = read(fd, buf, sizeof(buf)); if (rv < 0) { close(fd); return -1; } if ((len = strlen(buf)) && buf[len - 1] == '\n') buf[--len] = '\0'; if (strlen(buf)) { *val = atoi(buf); rv = 0; } close(fd); return rv; } static int write_sysfs_size(const char *disk_path, const char *name, unsigned int val) { char path[PATH_MAX]; char buf[32]; struct stat st; int major, minor; int fd; int rv; rv = stat(disk_path, &st); if (rv < 0) { log_debug("write_sysfs_size stat error %d %s", errno, disk_path); return -1; } major = (int)major(st.st_rdev); minor = (int)minor(st.st_rdev); snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/queue/%s", major, minor, name); memset(buf, 0, sizeof(buf)); snprintf(buf, sizeof(buf), "%u", val); fd = open(path, O_RDWR, 0); if (fd < 0) { log_debug("write_sysfs_size open error %d %s", errno, path); return -1; } rv = write(fd, buf, strlen(buf)); if (rv < 0) { log_debug("write_sysfs_size write %s error %d %s", buf, errno, path); close(fd); return -1; } close(fd); return 0; } /* * The default max_sectors_kb is 512 (KB), so a 1MB read is split into two * 512KB reads. Adjust this to at least do 1MB io's. 
*/ int set_max_sectors_kb(struct sync_disk *disk, uint32_t set_kb) { unsigned int max_kb = 0; int rv; rv = read_sysfs_size(disk->path, "max_sectors_kb", &max_kb); if (rv < 0) { log_debug("set_max_sectors_kb read error %d %s", rv, disk->path); return rv; } if (max_kb == set_kb) return 0; rv = write_sysfs_size(disk->path, "max_sectors_kb", set_kb); if (rv < 0) { log_debug("set_max_sectors_kb write %u error %d %s", set_kb, rv, disk->path); return rv; } return 0; } int get_max_sectors_kb(struct sync_disk *disk, uint32_t *max_sectors_kb) { unsigned int max = 0; int rv; rv = read_sysfs_size(disk->path, "max_sectors_kb", &max); if (!rv) *max_sectors_kb = max; return rv; } static int set_disk_properties(struct sync_disk *disk) { blkid_probe probe; uint32_t sector_size; probe = blkid_new_probe_from_filename(disk->path); if (!probe) { log_error("cannot get blkid probe %s", disk->path); return -1; } sector_size = blkid_probe_get_sectorsize(probe); blkid_free_probe(probe); disk->sector_size = sector_size; return 0; } void close_disks(struct sync_disk *disks, int num_disks) { int d; for (d = 0; d < num_disks; d++) { if (disks[d].fd == -1) continue; close(disks[d].fd); disks[d].fd = -1; } } int majority_disks(int num_disks, int num) { if (num_disks == 1 && !num) return 0; /* odd number of disks */ if (num_disks % 2) return num >= ((num_disks / 2) + 1); /* even number of disks */ if (num > (num_disks / 2)) return 1; if (num < (num_disks / 2)) return 0; /* TODO: half of disks are majority if tiebreaker disk is present */ return 0; } /* * set fd in each disk * returns 0 if majority of disks were opened successfully, -EXXX otherwise */ int open_disks_fd(struct sync_disk *disks, int num_disks) { struct sync_disk *disk; int num_opens = 0; int d, fd, rv = -1; for (d = 0; d < num_disks; d++) { disk = &disks[d]; if (disk->fd != -1) { log_error("open fd %d exists %s", disk->fd, disk->path); rv = -1; goto fail; } fd = open(disk->path, O_RDWR | O_DIRECT | O_SYNC, 0); if (fd < 0) { rv = 
-errno; if (rv == -EACCES) { log_error("open error %d EACCES: no permission to open %s", rv, disk->path); log_error("check that daemon user %s %d group %s %d has access to disk or file.", com.uname, com.uid, com.gname, com.gid); } else log_error("open error %d %s", fd, disk->path); continue; } disk->fd = fd; num_opens++; } if (!majority_disks(num_disks, num_opens)) { /* rv is open errno */ goto fail; } return 0; fail: close_disks(disks, num_disks); return rv; } /* * set fd and sector_size * verify offset is correctly aligned * returns 0 for success or -EXXX */ int open_disk(struct sync_disk *disk) { struct stat st; int fd, rv; fd = open(disk->path, O_RDWR | O_DIRECT | O_SYNC, 0); if (fd < 0) { rv = -errno; if (rv == -EACCES) { log_error("open error %d EACCES: no permission to open %s", rv, disk->path); log_error("check that daemon user %s %d group %s %d has access to disk or file.", com.uname, com.uid, com.gname, com.gid); } else log_error("open error %d %s", rv, disk->path); goto fail; } if (fstat(fd, &st) < 0) { rv = -errno; log_error("fstat error %d %s", rv, disk->path); close(fd); goto fail; } if (S_ISREG(st.st_mode)) { disk->sector_size = 512; } else { rv = set_disk_properties(disk); if (rv < 0) { close(fd); goto fail; } } disk->fd = fd; return 0; fail: if (rv >= 0) rv = -1; return rv; } /* * set fd and sector_size in each disk * verify all sector_size's match * returns 0 if majority of disks were opened successfully, -EXXX otherwise */ int open_disks(struct sync_disk *disks, int num_disks) { struct sync_disk *disk; int num_opens = 0; int d, err, rv = -1; uint32_t ss = 0; for (d = 0; d < num_disks; d++) { disk = &disks[d]; if (disk->fd != -1) { log_error("open fd %d exists %s", disk->fd, disk->path); rv = -ENOTEMPTY; goto fail; } err = open_disk(disk); if (err < 0) { rv = err; continue; } if (!ss) { ss = disk->sector_size; } else if (ss != disk->sector_size) { log_error("inconsistent sector sizes %u %u %s", ss, disk->sector_size, disk->path); } num_opens++; } 
if (!majority_disks(num_disks, num_opens)) { /* rv is from open err */ goto fail; } return 0; fail: close_disks(disks, num_disks); return rv; } static int do_write(int fd, uint64_t offset, const char *buf, int len, struct task *task, int *wr_ms) { off_t ret; int rv; int pos = 0; int sys_error = 0; int save_errno = 0; struct timespec begin, end, diff; const char *len_str; char off_str[16]; char ms_str[8]; if (task) task->io_count++; if (com.debug_io_submit) { len_str = align_size_debug_str(len); offset_to_str(offset, sizeof(off_str), off_str); if (len_str) log_taskd(task, "WR %s at %s", len_str, off_str); else log_taskd(task, "WR %d at %s", len, off_str); } ret = lseek(fd, offset, SEEK_SET); if (ret != offset) { log_taskw(task, "WR %d at %s seek error %llu", len, off_str, (unsigned long long)ret); return -1; } if (wr_ms) clock_gettime(CLOCK_MONOTONIC_RAW, &begin); retry: rv = write(fd, buf + pos, len); if (rv == -1 && errno == EINTR) goto retry; if (rv < 0) { sys_error = 1; save_errno = errno; goto out; } /* if (rv != len && len == sector_size) return error? partial sector writes should not happen AFAIK, and some uses depend on atomic single sector writes */ if (rv != len) { len -= rv; pos += rv; goto retry; } if (sys_error) rv = -1; else rv = 0; out: if (wr_ms) { clock_gettime(CLOCK_MONOTONIC_RAW, &end); ts_diff(&begin, &end, &diff); *wr_ms = (diff.tv_sec * 1000) + (diff.tv_nsec / 1000000); } if (com.debug_io_complete || sys_error) { len_str = align_size_debug_str(len); offset_to_str(offset, sizeof(off_str), off_str); if (wr_ms) { memset(ms_str, 0, sizeof(ms_str)); snprintf(ms_str, 7, "%u", *wr_ms); } if (len_str) log_taskd(task, "WR %s at %s done %s", len_str, off_str, wr_ms ? ms_str : ""); else log_taskd(task, "WR %d at %s done %s", len, off_str, wr_ms ? ms_str : ""); if (sys_error) log_taskw(task, "WR %d at %s error %d %s", len, off_str, save_errno, wr_ms ? 
ms_str : ""); } return rv; } static int do_read(int fd, uint64_t offset, char *buf, int len, struct task *task, int *rd_ms) { off_t ret; int rv, pos = 0; int sys_error = 0; int save_errno = 0; struct timespec begin, end, diff; const char *len_str; char off_str[16]; char ms_str[8]; if (task) task->io_count++; if (com.debug_io_submit) { len_str = align_size_debug_str(len); offset_to_str(offset, sizeof(off_str), off_str); if (len_str) log_taskd(task, "RD %s at %s", len_str, off_str); else log_taskd(task, "RD %d at %s", len, off_str); } ret = lseek(fd, offset, SEEK_SET); if (ret != offset) { log_taskw(task, "RD %d at %s seek error %llu", len, off_str, (unsigned long long)ret); return -1; } if (rd_ms) clock_gettime(CLOCK_MONOTONIC_RAW, &begin); while (pos < len) { rv = read(fd, buf + pos, len - pos); if (rv == 0) { sys_error = 1; save_errno = errno; break; } if (rv == -1 && errno == EINTR) continue; if (rv < 0) { sys_error = 1; save_errno = errno; break; } pos += rv; } if (sys_error) rv = -1; else rv = 0; if (rd_ms) { clock_gettime(CLOCK_MONOTONIC_RAW, &end); ts_diff(&begin, &end, &diff); *rd_ms = (diff.tv_sec * 1000) + (diff.tv_nsec / 1000000); } if (com.debug_io_complete || sys_error) { len_str = align_size_debug_str(len); offset_to_str(offset, sizeof(off_str), off_str); if (rd_ms) { memset(ms_str, 0, sizeof(ms_str)); snprintf(ms_str, 7, "%u", *rd_ms); } if (len_str) log_taskd(task, "RD %s at %s done %s", len_str, off_str, rd_ms ? ms_str : ""); else log_taskd(task, "RD %d at %s done %s", len, off_str, rd_ms ? ms_str : ""); if (sys_error) log_taskw(task, "RD %d at %s error %d %s", len, off_str, save_errno, rd_ms ? 
ms_str : ""); } return rv; } static struct aicb *find_callback_slot(struct task *task, int ioto) { struct timespec ts; struct io_event event; int cleared = 0; int rv; int i; find: for (i = 0; i < task->cb_size; i++) { if (task->callbacks[i].used) continue; return &task->callbacks[i]; } if (cleared++) return NULL; memset(&ts, 0, sizeof(struct timespec)); ts.tv_sec = ioto; retry: memset(&event, 0, sizeof(event)); rv = io_getevents(task->aio_ctx, 1, 1, &event, &ts); if (rv == -EINTR) goto retry; if (rv < 0) return NULL; if (rv == 1) { struct iocb *ev_iocb = event.obj; struct aicb *ev_aicb = container_of(ev_iocb, struct aicb, iocb); int op = ev_iocb ? ev_iocb->aio_lio_opcode : -1; const char *op_str; if (op == IO_CMD_PREAD) op_str = "RD"; else if (op == IO_CMD_PWRITE) op_str = "WR"; else op_str = "UK"; log_taskw(task, "aio collect %s %p:%p:%p result %ld:%ld old free", op_str, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); ev_aicb->used = 0; free(ev_aicb->buf); ev_aicb->buf = NULL; goto find; } return NULL; } void offset_to_str(unsigned long long offset, int buflen, char *off_str) { uint64_t num_mb; if (!offset) { strncpy(off_str, "0", buflen); } else if (!(offset % 1048576)) { num_mb = offset / 1048576; snprintf(off_str, buflen, "%uM", (uint32_t)num_mb); } else { snprintf(off_str, buflen, "%llu", (unsigned long long)offset); } } /* * If this function returns SANLK_AIO_TIMEOUT, it means the io has timed out * and the event for the timed out io has not been reaped; the caller cannot * free the buf it passed in. It will be freed by a subsequent call when the * event is reaped. (Using my own error value here because I'm not certain * what values we might return from event.res.) 
*/

/*
 * Submit one linux aio request (cmd is IO_CMD_PREAD or IO_CMD_PWRITE) for
 * len bytes at offset and wait up to ioto seconds for its completion.
 * If ms is non-NULL it receives the elapsed milliseconds once an event
 * has been collected.
 *
 * Return values set in this function:
 *   0                  the request completed with exactly len bytes
 *   -EINVAL            ioto is zero
 *   -ENOENT            no free callback slot could be found
 *   -EMSGSIZE          the request completed with a different byte count
 *   -ECANCELED         the wait timed out and io_cancel succeeded
 *   SANLK_AIO_TIMEOUT  the wait timed out and io_cancel failed
 *   other negative     error from io_submit, io_getevents or event.res
 */
static int do_linux_aio(int fd, uint64_t offset, char *buf, int len,
			struct task *task, int ioto, int cmd, int *ms)
{
	struct timespec ts;
	struct aicb *aicb;
	struct iocb *iocb;
	struct io_event event;
	struct timespec begin, end, diff;
	const char *op_str;
	const char *len_str;
	char ms_str[8];
	char off_str[16];
	int rv;

	if (!ioto) {
		log_taske(task, "aio %d zero io timeout", cmd);
		return -EINVAL;
	}

	/* I expect this pre-emptively catches the io_submit EAGAIN case */

	aicb = find_callback_slot(task, ioto);
	if (!aicb)
		return -ENOENT;

	iocb = &aicb->iocb;

	memset(iocb, 0, sizeof(struct iocb));
	iocb->aio_fildes = fd;
	iocb->aio_lio_opcode = cmd;
	iocb->u.c.buf = buf;
	iocb->u.c.nbytes = len;
	iocb->u.c.offset = offset;

	if (cmd == IO_CMD_PREAD)
		op_str = "RD";
	else if (cmd == IO_CMD_PWRITE)
		op_str = "WR";
	else
		op_str = "UK";

	if (com.debug_io_submit) {
		len_str = align_size_debug_str(len);
		offset_to_str(offset, sizeof(off_str), off_str);
		if (len_str)
			log_taskd(task, "%s %s at %s", op_str, len_str, off_str);
		else
			log_taskd(task, "%s %d at %s", op_str, len, off_str);
	}

	if (ms)
		clock_gettime(CLOCK_MONOTONIC_RAW, &begin);

	rv = io_submit(task->aio_ctx, 1, &iocb);
	if (rv < 0) {
		log_taske(task, "aio submit %d %p:%p:%p rv %d fd %d",
			  cmd, aicb, iocb, buf, rv, fd);
		goto out;
	}

	task->io_count++;

	/* don't reuse aicb->iocb or free the buf until we reap the event */
	aicb->used = 1;
	aicb->buf = buf;

	memset(&ts, 0, sizeof(struct timespec));
	ts.tv_sec = ioto;
 retry:
	memset(&event, 0, sizeof(event));

	rv = io_getevents(task->aio_ctx, 1, 1, &event, &ts);
	if (rv == -EINTR)
		goto retry;
	if (rv < 0) {
		/* NOTE(review): aicb->used and aicb->buf stay set on this
		   path, while _write_sectors() frees its buffer for any
		   return other than SANLK_AIO_TIMEOUT -- confirm the
		   request cannot still complete later. */
		log_taske(task, "aio getevent %p:%p:%p rv %d",
			  aicb, iocb, buf, rv);
		goto out;
	}
	if (rv == 1) {
		/* NOTE(review): op tolerates a NULL event.obj but ev_aicb is
		   dereferenced unconditionally below -- confirm event.obj is
		   never NULL. */
		struct iocb *ev_iocb = event.obj;
		struct aicb *ev_aicb = container_of(ev_iocb, struct aicb, iocb);
		int op = ev_iocb ? ev_iocb->aio_lio_opcode : -1;

		if (op == IO_CMD_PREAD)
			op_str = "RD";
		else if (op == IO_CMD_PWRITE)
			op_str = "WR";
		else
			op_str = "UK";

		if (ms) {
			clock_gettime(CLOCK_MONOTONIC_RAW, &end);
			ts_diff(&begin, &end, &diff);
			*ms = (diff.tv_sec * 1000) + (diff.tv_nsec / 1000000);
		}

		ev_aicb->used = 0;

		/* an event for some other request: release the buffer
		   recorded for it and keep waiting for ours */
		if (ev_iocb != iocb) {
			log_taskw(task, "aio collect %s %p:%p:%p result %ld:%ld other free",
				  op_str, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2);
			free(ev_aicb->buf);
			ev_aicb->buf = NULL;
			goto retry;
		}
		if ((int)event.res < 0) {
			log_taskw(task, "aio collect %s %p:%p:%p result %ld:%ld match res",
				  op_str, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2);
			rv = event.res;
			goto out;
		}
		if (event.res != len) {
			log_taskw(task, "aio collect %s %p:%p:%p result %ld:%ld match len %d",
				  op_str, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2, len);
			rv = -EMSGSIZE;
			goto out;
		}

		/* standard success case */

		if (com.debug_io_complete) {
			len_str = align_size_debug_str(len);
			offset_to_str(offset, sizeof(off_str), off_str);

			if (ms) {
				memset(ms_str, 0, sizeof(ms_str));
				snprintf(ms_str, 7, "%u", *ms);
			}

			if (len_str)
				log_taskd(task, "%s %s at %s done %s", op_str, len_str, off_str, ms ? ms_str : "");
			else
				log_taskd(task, "%s %d at %s done %s", op_str, len, off_str, ms ? ms_str : "");
		}

		rv = 0;
		goto out;
	}

	/* Timed out waiting for result.  If cancel fails, we could try retry
	   io_getevents indefinitely, but that removes the whole point of using
	   aio, which is the timeout.  So, we need to be prepared to reap the
	   event the next time we call io_getevents for a different i/o.  We
	   can't reuse the iocb for this timed out io until we get an event for
	   it because we need to compare the iocb to event.obj to distinguish
	   events for separate submissions.

	   dct: io_cancel doesn't work, in general.  you are very likely going
	   to get -EINVAL from that call */

	task->to_count++;

	if (cmd == IO_CMD_PREAD)
		op_str = "RD";
	else if (cmd == IO_CMD_PWRITE)
		op_str = "WR";
	else
		op_str = "UK";

	log_taskw(task, "aio timeout %s %p:%p:%p ioto %d to_count %d",
		  op_str, aicb, iocb, buf, ioto, task->to_count);

	rv = io_cancel(task->aio_ctx, iocb, &event);
	if (!rv) {
		aicb->used = 0;
		rv = -ECANCELED;
	} else {
		/* aicb->used and aicb->buf both remain set */
		rv = SANLK_AIO_TIMEOUT;

		/* remember the slot of a timed out read in the task */
		if (cmd == IO_CMD_PREAD)
			task->read_iobuf_timeout_aicb = aicb;
	}
 out:
	return rv;
}

/* Write through do_linux_aio() with IO_CMD_PWRITE. */
static int do_write_aio_linux(int fd, uint64_t offset, char *buf, int len,
			      struct task *task, int ioto, int *wr_ms)
{
	return do_linux_aio(fd, offset, buf, len, task, ioto, IO_CMD_PWRITE, wr_ms);
}

/* Read through do_linux_aio() with IO_CMD_PREAD. */
static int do_read_aio_linux(int fd, uint64_t offset, char *buf, int len,
			     struct task *task, int ioto, int *rd_ms)
{
	return do_linux_aio(fd, offset, buf, len, task, ioto, IO_CMD_PREAD, rd_ms);
}

/* write aligned io buffer */
/* uses aio when task is non-NULL and task->use_aio is set, otherwise the
   synchronous do_write(), which does not use ioto */

int write_iobuf(int fd, uint64_t offset, char *iobuf, int iobuf_len,
		struct task *task, int ioto, int *wr_ms)
{
	if (task && task->use_aio)
		return do_write_aio_linux(fd, offset, iobuf, iobuf_len, task, ioto, wr_ms);
	else
		return do_write(fd, offset, iobuf, iobuf_len, task, wr_ms);
}

/*
 * Common body of write_sector() and write_sectors(): allocate a
 * page-aligned, zero-filled buffer of iobuf_len bytes, copy data_len bytes
 * of data into it and write it at disk->offset + sector_nr * sector_size.
 * The buffer is freed here unless the write returns SANLK_AIO_TIMEOUT.
 * Returns the write_iobuf() result, or -ENOMEM if allocation fails.
 */
static int _write_sectors(const struct sync_disk *disk, int sector_size, uint64_t sector_nr,
			  uint32_t sector_count GNUC_UNUSED,
			  const char *data, int data_len,
			  int iobuf_len, struct task *task, int ioto,
			  const char *blktype)
{
	char *iobuf, **p_iobuf;
	uint64_t offset;
	int rv;

	offset = disk->offset + (sector_nr * sector_size);

	p_iobuf = &iobuf;

	rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len);
	if (rv) {
		log_error("write_sectors %s posix_memalign rv %d %s",
			  blktype, rv, disk->path);
		rv = -ENOMEM;
		goto out;
	}

	memset(iobuf, 0, iobuf_len);
	memcpy(iobuf, data, data_len);

	rv = write_iobuf(disk->fd, offset, iobuf, iobuf_len, task, ioto, NULL);
	if (rv < 0) {
		log_error("write_sectors %s offset %llu rv %d %s",
			  blktype, (unsigned long long)offset, rv, disk->path);
	}

	/* on SANLK_AIO_TIMEOUT the buffer must stay allocated */
	if (rv != SANLK_AIO_TIMEOUT)
		free(iobuf);
 out:
	return rv;
}

/* sector_nr is logical sector number within the sync_disk.
   the sync_disk itself begins at disk->offset (in bytes) from
   the start of the block device identified by disk->path,
   data_len must be <= sector_size */
/* returns -EINVAL for a sector_size other than 512 or 4096, -1 if data_len
   is larger than sector_size, otherwise the _write_sectors() result */

int write_sector(const struct sync_disk *disk, int sector_size, uint64_t sector_nr,
		 const char *data, int data_len,
		 struct task *task, int ioto, const char *blktype)
{
	int iobuf_len = sector_size;

	if ((sector_size != 4096) && (sector_size != 512)) {
		log_error("write_sector bad sector_size %d", sector_size);
		return -EINVAL;
	}

	if (data_len > iobuf_len) {
		log_error("write_sector %s data_len %d max %d %s",
			  blktype, data_len, iobuf_len, disk->path);
		return -1;
	}

	return _write_sectors(disk, sector_size, sector_nr, 1, data, data_len,
			      iobuf_len, task, ioto, blktype);
}

/* write multiple complete sectors, data_len must be multiple of sector size */
/* returns -EINVAL for a sector_size other than 512 or 4096, -1 if data_len
   is not sector_count * sector_size, otherwise the _write_sectors() result */

int write_sectors(const struct sync_disk *disk, int sector_size, uint64_t sector_nr,
		  uint32_t sector_count, const char *data, int data_len,
		  struct task *task, int ioto, const char *blktype)
{
	int iobuf_len = data_len;

	if ((sector_size != 4096) && (sector_size != 512)) {
		log_error("write_sectors bad sector_size %d", sector_size);
		return -EINVAL;
	}

	if (data_len != sector_count * sector_size) {
		log_error("write_sectors %s data_len %d sector_count %d %s",
			  blktype, data_len, sector_count, disk->path);
		return -1;
	}

	return _write_sectors(disk, sector_size, sector_nr, sector_count, data, data_len,
			      iobuf_len, task, ioto, blktype);
}

/* read aligned io buffer */
/* uses aio when task is non-NULL and task->use_aio is set, otherwise the
   synchronous do_read(), which does not use ioto */

int read_iobuf(int fd, uint64_t offset, char *iobuf, int iobuf_len,
	       struct task *task, int ioto, int *rd_ms)
{
	if (task && task->use_aio)
		return do_read_aio_linux(fd, offset, iobuf, iobuf_len, task, ioto, rd_ms);
	else
		return do_read(fd, offset, iobuf, iobuf_len, task, rd_ms);
}

/* read sector_count sectors starting with sector_nr, where sector_nr
   is a logical sector number within the
sync_disk.  the caller will generally want to look at the first N bytes of
   each sector.  when reading multiple sectors, data_len will generally equal
   iobuf_len, but when reading one sector, data_len may be less than
   iobuf_len.

   data_len bytes are copied to data only when the read returns 0.
   returns -EINVAL if sector_size is not 512 or 4096 or if data_len is
   negative or larger than sector_count * sector_size, -ENOMEM if the
   buffer cannot be allocated, otherwise the read_iobuf() result. */

int read_sectors(const struct sync_disk *disk, int sector_size, uint64_t sector_nr,
		 uint32_t sector_count, char *data, int data_len,
		 struct task *task, int ioto, const char *blktype)
{
	char *iobuf, **p_iobuf;
	uint64_t offset;
	int iobuf_len;
	int rv;

	if ((sector_size != 512) && (sector_size != 4096)) {
		log_error("read_sectors %s bad sector_size %d", blktype, sector_size);
		return -EINVAL;
	}

	iobuf_len = sector_count * sector_size;
	offset = disk->offset + (sector_nr * sector_size);

	/* the copy-out below must not read past the end of iobuf */
	if (data_len < 0 || data_len > iobuf_len) {
		log_error("read_sectors %s data_len %d max %d %s",
			  blktype, data_len, iobuf_len, disk->path);
		return -EINVAL;
	}

	p_iobuf = &iobuf;

	rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len);
	if (rv) {
		log_error("read_sectors %s posix_memalign rv %d %s",
			  blktype, rv, disk->path);
		rv = -ENOMEM;
		goto out;
	}

	memset(iobuf, 0, iobuf_len);

	rv = read_iobuf(disk->fd, offset, iobuf, iobuf_len, task, ioto, NULL);
	if (!rv) {
		memcpy(data, iobuf, data_len);
	} else {
		log_error("read_sectors %s offset %llu rv %d %s",
			  blktype, (unsigned long long)offset, rv, disk->path);
	}

	/* on SANLK_AIO_TIMEOUT the buffer must stay allocated */
	if (rv != SANLK_AIO_TIMEOUT)
		free(iobuf);
 out:
	return rv;
}

/* Try to reap the event of a previously timed out read_iobuf.
   The aicb used in a task's last timed out read_iobuf is
   task->read_iobuf_timeout_aicb .
*/

/*
 * Wait up to ioto_msec milliseconds for the completion event of the read
 * recorded in task->read_iobuf_timeout_aicb.  fd, offset, iobuf and
 * iobuf_len must match that read.
 *
 * Returns 0 when the read completed with iobuf_len bytes, -EINVAL when
 * there is no recorded read or the arguments do not match it, -EMSGSIZE
 * when it completed with a different byte count, SANLK_AIO_TIMEOUT when
 * no matching event arrived in time, and otherwise the negative value
 * from io_getevents or event.res.
 *
 * Events collected for other requests are released on the way: their
 * slot is marked unused and the buffer recorded in it is freed.
 */
int read_iobuf_reap(int fd, uint64_t offset, char *iobuf, int iobuf_len,
		    struct task *task, uint32_t ioto_msec)
{
	struct timespec ts;
	struct aicb *aicb;
	struct iocb *iocb;
	struct io_event event;
	int rv;

	/* the pointer stays NULL until a read has timed out on this task */
	aicb = task->read_iobuf_timeout_aicb;
	if (!aicb)
		return -EINVAL;

	iocb = &aicb->iocb;

	if (!aicb->used)
		return -EINVAL;
	if (iocb->aio_fildes != fd)
		return -EINVAL;
	if (iocb->u.c.buf != iobuf)
		return -EINVAL;
	if (iocb->u.c.nbytes != iobuf_len)
		return -EINVAL;
	if (iocb->u.c.offset != offset)
		return -EINVAL;
	if (iocb->aio_lio_opcode != IO_CMD_PREAD)
		return -EINVAL;

	memset(&ts, 0, sizeof(struct timespec));
	ts.tv_sec = ioto_msec / 1000;
	ts.tv_nsec = (ioto_msec % 1000) * 1000000;
 retry:
	memset(&event, 0, sizeof(event));

	rv = io_getevents(task->aio_ctx, 1, 1, &event, &ts);
	if (rv == -EINTR)
		goto retry;
	if (rv < 0) {
		log_taske(task, "aio getevent %p:%p:%p rv %d r",
			  aicb, iocb, iobuf, rv);
		goto out;
	}
	if (rv == 1) {
		struct iocb *ev_iocb = event.obj;
		struct aicb *ev_aicb = container_of(ev_iocb, struct aicb, iocb);
		int op = ev_iocb ? ev_iocb->aio_lio_opcode : -1;
		const char *op_str;

		if (op == IO_CMD_PREAD)
			op_str = "RD";
		else if (op == IO_CMD_PWRITE)
			op_str = "WR";
		else
			op_str = "UK";

		ev_aicb->used = 0;

		/* an event for some other request: release its buffer and
		   keep waiting for ours */
		if (ev_iocb != iocb) {
			log_taskw(task, "aio collect %s %p:%p:%p result %ld:%ld other free r",
				  op_str, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2);
			free(ev_aicb->buf);
			ev_aicb->buf = NULL;
			goto retry;
		}
		if ((int)event.res < 0) {
			log_taskw(task, "aio collect %s %p:%p:%p result %ld:%ld match res r",
				  op_str, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2);
			rv = event.res;
			goto out;
		}
		if (event.res != iobuf_len) {
			log_taskw(task, "aio collect %s %p:%p:%p result %ld:%ld match len %d r",
				  op_str, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2, iobuf_len);
			rv = -EMSGSIZE;
			goto out;
		}

		log_taskw(task, "aio collect %s %p:%p:%p result %ld:%ld match reap",
			  op_str, ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2);

		rv = 0;
		goto out;
	}

	/* timed out again */
	rv = SANLK_AIO_TIMEOUT;
 out:
	return rv;
}
sanlock-3.8.2/src/diskio.h000066400000000000000000000040161371427612200154060ustar00rootroot00000000000000/*
 * Copyright 2010-2011 Red Hat, Inc.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v2 or (at your option) any later version.
*/

#ifndef __DISKIO_H__
#define __DISKIO_H__

/* format a byte offset for log messages */
void offset_to_str(unsigned long long offset, int buflen, char *off_str);

/* opening and closing sync_disks */
void close_disks(struct sync_disk *disks, int num_disks);
int open_disk(struct sync_disk *disks);
int open_disks(struct sync_disk *disks, int num_disks);
int open_disks_fd(struct sync_disk *disks, int num_disks);
int majority_disks(int num_disks, int num);

/* block device queue attributes under /sys/dev/block/<major>:<minor>/queue/ */
int read_sysfs_size(const char *path, const char *name, unsigned int *val);
int set_max_sectors_kb(struct sync_disk *disk, uint32_t max_sectors_kb);
int get_max_sectors_kb(struct sync_disk *disk, uint32_t *max_sectors_kb);

/*
 * iobuf functions require the caller to allocate iobuf using posix_memalign
 * and pass it into the function
 */

int write_iobuf(int fd, uint64_t offset, char *iobuf, int iobuf_len,
		struct task *task, int ioto, int *wr_ms);

int read_iobuf(int fd, uint64_t offset, char *iobuf, int iobuf_len,
	       struct task *task, int ioto, int *rd_ms);

int read_iobuf_reap(int fd, uint64_t offset, char *iobuf, int iobuf_len,
		    struct task *task, uint32_t ioto_msec);

/*
 * sector functions allocate an iobuf themselves, copy into it for write, use
 * it for io, copy out of it for read, and free it
 */

int write_sector(const struct sync_disk *disk, int sector_size, uint64_t sector_nr,
		 const char *data, int data_len,
		 struct task *task, int ioto, const char *blktype);

int write_sectors(const struct sync_disk *disk, int sector_size, uint64_t sector_nr,
		  uint32_t sector_count, const char *data, int data_len,
		  struct task *task, int ioto, const char *blktype);

int read_sectors(const struct sync_disk *disk, int sector_size, uint64_t sector_nr,
		 uint32_t sector_count, char *data, int data_len,
		 struct task *task, int ioto, const char *blktype);
#endif
sanlock-3.8.2/src/env.c000066400000000000000000000011731371427612200147100ustar00rootroot00000000000000/*
 * Copyright 2018 Red Hat, Inc.
* * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include "env.h" const char *env_get(const char *key, const char *defval) { const char *val; val = getenv(key); if (val == NULL) return defval; return val; } int env_get_bool(const char *key, int defval) { const char *val; val = getenv(key); if (val == NULL) return defval; return strcmp(val, "1") ? 0 : 1; } sanlock-3.8.2/src/env.h000066400000000000000000000006411371427612200147140ustar00rootroot00000000000000/* * Copyright 2018 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __ENV_H__ #define __ENV_H__ const char *env_get(const char *key, const char *defval); int env_get_bool(const char *key, int defval); #endif sanlock-3.8.2/src/helper.c000066400000000000000000000115301371427612200153750ustar00rootroot00000000000000/* * Copyright 2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock.h" #include "monotime.h" #include "helper.h" #define MAX_AV_COUNT 8 static void run_path(struct helper_msg *hm) { char arg[SANLK_HELPER_ARGS_LEN]; char *args = hm->args; char *av[MAX_AV_COUNT + 1]; /* +1 for NULL */ int av_count = 0; int i, arg_len, args_len; for (i = 0; i < MAX_AV_COUNT + 1; i++) av[i] = NULL; av[av_count++] = strdup(hm->path); if (!args[0]) goto pid_arg; /* this should already be done, but make sure */ args[SANLK_HELPER_ARGS_LEN - 1] = '\0'; memset(&arg, 0, sizeof(arg)); arg_len = 0; args_len = strlen(args); for (i = 0; i < args_len; i++) { if (!args[i]) break; if (av_count == MAX_AV_COUNT) break; if (args[i] == '\\') { if (i == (args_len - 1)) break; i++; if (args[i] == '\\') { arg[arg_len++] = args[i]; continue; } if (isspace(args[i])) { arg[arg_len++] = args[i]; continue; } else { break; } } if (isalnum(args[i]) || ispunct(args[i])) { arg[arg_len++] = args[i]; } else if (isspace(args[i])) { if (arg_len) av[av_count++] = strdup(arg); memset(arg, 0, sizeof(arg)); arg_len = 0; } else { break; } } if ((av_count < MAX_AV_COUNT) && arg_len) { av[av_count++] = strdup(arg); } pid_arg: if ((av_count < MAX_AV_COUNT) && hm->pid) { memset(arg, 0, sizeof(arg)); snprintf(arg, sizeof(arg)-1, "%d", hm->pid); av[av_count++] = strdup(arg); } execvp(av[0], av); } static int read_hm(int fd, struct helper_msg *hm) { int rv; retry: rv = read(fd, hm, sizeof(struct helper_msg)); if (rv == -1 && errno == EINTR) goto retry; if (rv != sizeof(struct helper_msg)) return -1; return 0; } static int send_status(int fd) { struct helper_status hs; int rv; memset(&hs, 0, sizeof(hs)); hs.type = HELPER_STATUS; rv = write(fd, &hs, sizeof(hs)); if (rv == sizeof(hs)) return 0; return -1; } #define log_debug(fmt, args...) 
\ do { \ if (log_stderr) \ fprintf(stderr, "%ld " fmt "\n", time(NULL), ##args); \ } while (0) #define STANDARD_TIMEOUT_MS (HELPER_STATUS_INTERVAL*1000) #define RECOVERY_TIMEOUT_MS 1000 int run_helper(int in_fd, int out_fd, int log_stderr) { char name[16]; struct pollfd pollfd; struct helper_msg hm; unsigned int fork_count = 0; unsigned int wait_count = 0; time_t now, last_send, last_good = 0; int timeout = STANDARD_TIMEOUT_MS; int rv, pid, status; memset(name, 0, sizeof(name)); sprintf(name, "%s", "sanlock-helper"); prctl(PR_SET_NAME, (unsigned long)name, 0, 0, 0); rv = setgroups(0, NULL); if (rv < 0) log_debug("error clearing helper groups errno %i", errno); memset(&pollfd, 0, sizeof(pollfd)); pollfd.fd = in_fd; pollfd.events = POLLIN; now = monotime(); last_send = now; rv = send_status(out_fd); if (!rv) last_good = now; while (1) { rv = poll(&pollfd, 1, timeout); if (rv == -1 && errno == EINTR) continue; if (rv < 0) exit(0); now = monotime(); if (now - last_good >= HELPER_STATUS_INTERVAL && now - last_send >= 2) { last_send = now; rv = send_status(out_fd); if (!rv) last_good = now; } memset(&hm, 0, sizeof(hm)); if (pollfd.revents & POLLIN) { rv = read_hm(in_fd, &hm); if (rv) continue; if (hm.type == HELPER_MSG_RUNPATH) { pid = fork(); if (!pid) { run_path(&hm); exit(-1); } fork_count++; /* log_debug("helper fork %d count %d %d %s %s", pid, fork_count, wait_count, hm.path, hm.args); */ } else if (hm.type == HELPER_MSG_KILLPID) { kill(hm.pid, hm.sig); } } if (pollfd.revents & (POLLERR | POLLHUP | POLLNVAL)) exit(0); /* collect child exits until no more children exist (ECHILD) or none are ready (WNOHANG) */ while (1) { rv = waitpid(-1, &status, WNOHANG); if (rv > 0) { wait_count++; /* log_debug("helper wait %d count %d %d", rv, fork_count, wait_count); */ continue; } /* no more children to wait for or no children have exited */ if (rv < 0 && errno == ECHILD) { if (timeout == RECOVERY_TIMEOUT_MS) { log_debug("helper no children count %d %d", fork_count, wait_count); 
} timeout = STANDARD_TIMEOUT_MS; } else { timeout = RECOVERY_TIMEOUT_MS; } break; } } return 0; } sanlock-3.8.2/src/helper.h000066400000000000000000000016251371427612200154060ustar00rootroot00000000000000/* * Copyright 2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __HELPER_H__ #define __HELPER_H__ /* * helper process * recvs 512 byte helper_msg on in_fd * sends 4 byte helper_status on out_fd */ #define SANLK_HELPER_MSG_LEN 512 #define HELPER_MSG_RUNPATH 1 #define HELPER_MSG_KILLPID 2 struct helper_msg { uint8_t type; uint8_t pad1; uint16_t pad2; uint32_t flags; int pid; int sig; char path[SANLK_HELPER_PATH_LEN]; /* 128 */ char args[SANLK_HELPER_ARGS_LEN]; /* 128 */ char pad[240]; }; #define HELPER_STATUS_INTERVAL 30 #define HELPER_STATUS 1 struct helper_status { uint8_t type; uint8_t status; uint16_t len; }; int run_helper(int in_fd, int out_fd, int log_stderr); #endif sanlock-3.8.2/src/leader.h000066400000000000000000000057341371427612200153700ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/

#ifndef __LEADER_H__
#define __LEADER_H__

/* does not include terminating null byte */
/* NB NAME_ID_SIZE must match SANLK_NAME_LEN */
/* NB NAME_ID_SIZE is part of ondisk format */

#define NAME_ID_SIZE 48

/*
 * paxos minor version changes:
 * 4: add ALIGN leader flags
 *
 * delta minor version changes:
 * 4: add ALIGN leader flags
 */

#define PAXOS_DISK_MAGIC 0x06152010
#define PAXOS_DISK_CLEAR 0x11282016
#define PAXOS_DISK_VERSION_MAJOR 0x00060000
#define PAXOS_DISK_VERSION_MINOR 0x00000004

#define DELTA_DISK_MAGIC 0x12212010
#define DELTA_DISK_VERSION_MAJOR 0x00030000
#define DELTA_DISK_VERSION_MINOR 0x00000004

/* for all disk structures:
   uint64 aligned on 8 byte boundaries,
   uint32 aligned on 4 byte boundaries, etc */

/* NB. adjust LEADER_COMPARE_LEN and LEADER_CHECKSUM_LEN when changing
   this struct.
   LEADER_CHECKSUM_LEN should end just before the checksum field.
   LEADER_COMPARE_LEN should end just before timestamp.
   The checksum field should come after the timestamp field (in the
   current layout unused1 sits between them).

   The leader may be partially through updating the timestamp on
   multiple leader blocks in a lease, but for the purpose of counting
   repetitions of a leader block owned by a single host they should be
   counted together, so COMPARE_LEN should exclude timestamp. */

/* 152 is the byte offset of timestamp and 168 the byte offset of checksum
   in struct leader_record below */
#define LEADER_COMPARE_LEN 152
#define LEADER_CHECKSUM_LEN 168
#define LEASE_FREE 0

/* leader_record flags */

#define LFL_SHORT_HOLD 0x00000001
/* skip ahead in flag numbers so these align flags match other defines */
#define LFL_ALIGN_1M 0x00000010
#define LFL_ALIGN_2M 0x00000020
#define LFL_ALIGN_4M 0x00000040
#define LFL_ALIGN_8M 0x00000080

/* byte offsets in the trailing comments follow from the field sizes and
   the alignment rule stated above; the struct is 200 bytes in total */
struct leader_record {
	uint32_t magic;			/* offset 0 */
	uint32_t version;		/* offset 4 */
	uint32_t flags;			/* offset 8 */
	uint32_t sector_size;		/* offset 12 */
	uint64_t num_hosts;		/* offset 16 */
	uint64_t max_hosts;		/* offset 24 */
	uint64_t owner_id; /* host_id of owner */	/* offset 32 */
	uint64_t owner_generation;	/* offset 40 */
	uint64_t lver;			/* offset 48 */
	char space_name[NAME_ID_SIZE]; /* lockspace for resource */	/* offset 56 */
	char resource_name[NAME_ID_SIZE]; /* resource being locked */	/* offset 104 */
	uint64_t timestamp;		/* offset 152 == LEADER_COMPARE_LEN */
	uint64_t unused1;		/* offset 160 */
	uint32_t checksum;		/* offset 168 == LEADER_CHECKSUM_LEN */
	uint16_t unused2;		/* offset 172 */
	uint16_t io_timeout;		/* offset 174 */
	uint64_t write_id;         /* for extra info, debug */	/* offset 176 */
	uint64_t write_generation; /* for extra info, debug */	/* offset 184 */
	uint64_t write_timestamp;  /* for extra info, debug */	/* offset 192 */
};

/* leader_record can use first 256 bytes of a sector,
   bitmap uses the last 256 bytes */

#define LEADER_RECORD_MAX 256
#define HOSTID_BITMAP_OFFSET 256
#define HOSTID_BITMAP_SIZE 256

/* the request record is in the sector following the leader record
   for a paxos lease.
*/ #define REQ_DISK_MAGIC 0x08292011 #define REQ_DISK_VERSION_MAJOR 0x00010000 #define REQ_DISK_VERSION_MINOR 0x00000001 struct request_record { uint32_t magic; uint32_t version; uint64_t lver; uint32_t force_mode; }; #endif sanlock-3.8.2/src/libsanlock.pc.in000066400000000000000000000004011371427612200170170ustar00rootroot00000000000000prefix=/usr exec_prefix=${prefix} includedir=${prefix}/include libdir=${exec_prefix}/lib64 Name: libsanlock Description: The sanlock library Version: @VERSION@ Cflags: -I${includedir} Libs: -L${libdir} -lpthread -lrt -laio -lblkid -luuid -lwdmd -lsanlock sanlock-3.8.2/src/libsanlock_client.pc.in000066400000000000000000000003531371427612200203630ustar00rootroot00000000000000prefix=/usr exec_prefix=${prefix} includedir=${prefix}/include libdir=${exec_prefix}/lib64 Name: libsanlock_client Description: The sanlock client library Version: @VERSION@ Cflags: -I${includedir} Libs: -L${libdir} -lsanlock_client sanlock-3.8.2/src/list.h000066400000000000000000000421421371427612200151010ustar00rootroot00000000000000/* Copied from linux kernel */ #ifndef _LINUX_LIST_H #define _LINUX_LIST_H /* * Simple doubly linked list implementation. * * Some of the internal functions ("__xxx") are useful when * manipulating whole lists rather than single entries, as * sometimes we already know the next/prev entries and we can * generate better code by using them directly rather than * using the generic single-entry routines. */ /** * container_of - cast a member of a structure out to the containing structure * * @ptr: the pointer to the member. * @type: the type of the container struct this is embedded in. * @member: the name of the member within the struct. 
* */ #define container_of(ptr, type, member) ({ \ const typeof( ((type *)0)->member ) *__mptr = (ptr); \ (type *)( (char *)__mptr - offsetof(type,member) );}) #define LIST_POISON1 ((void *) 0x00100100) #define LIST_POISON2 ((void *) 0x00200200) struct list_head { struct list_head *next, *prev; }; #define LIST_HEAD_INIT(name) { &(name), &(name) } #define LIST_HEAD(name) \ struct list_head name = LIST_HEAD_INIT(name) static inline void INIT_LIST_HEAD(struct list_head *list) { list->next = list; list->prev = list; } /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { next->prev = new; new->next = next; new->prev = prev; prev->next = new; } /** * list_add - add a new entry * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. */ static inline void list_add(struct list_head *new, struct list_head *head) { __list_add(new, head, head->next); } /** * list_add_tail - add a new entry * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. */ static inline void list_add_tail(struct list_head *new, struct list_head *head) { __list_add(new, head->prev, head); } /* * Delete a list entry by making the prev/next entries * point to each other. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; prev->next = next; } /** * list_del - deletes entry from list. * @entry: the element to delete from the list. * Note: list_empty() on entry does not return true after this, the entry is * in an undefined state. 
*/ static inline void list_del(struct list_head *entry) { __list_del(entry->prev, entry->next); entry->next = LIST_POISON1; entry->prev = LIST_POISON2; } /** * list_replace - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * If @old was empty, it will be overwritten. */ static inline void list_replace(struct list_head *old, struct list_head *new) { new->next = old->next; new->next->prev = new; new->prev = old->prev; new->prev->next = new; } static inline void list_replace_init(struct list_head *old, struct list_head *new) { list_replace(old, new); INIT_LIST_HEAD(old); } /** * list_del_init - deletes entry from list and reinitialize it. * @entry: the element to delete from the list. */ static inline void list_del_init(struct list_head *entry) { __list_del(entry->prev, entry->next); INIT_LIST_HEAD(entry); } /** * list_move - delete from one list and add as another's head * @list: the entry to move * @head: the head that will precede our entry */ static inline void list_move(struct list_head *list, struct list_head *head) { __list_del(list->prev, list->next); list_add(list, head); } /** * list_move_tail - delete from one list and add as another's tail * @list: the entry to move * @head: the head that will follow our entry */ static inline void list_move_tail(struct list_head *list, struct list_head *head) { __list_del(list->prev, list->next); list_add_tail(list, head); } /** * list_is_last - tests whether @list is the last entry in list @head * @list: the entry to test * @head: the head of the list */ static inline int list_is_last(const struct list_head *list, const struct list_head *head) { return list->next == head; } /** * list_empty - tests whether a list is empty * @head: the list to test. 
*/ static inline int list_empty(const struct list_head *head) { return head->next == head; } /** * list_empty_careful - tests whether a list is empty and not being modified * @head: the list to test * * Description: * tests whether a list is empty _and_ checks that no other CPU might be * in the process of modifying either member (next or prev) * * NOTE: using list_empty_careful() without synchronization * can only be safe if the only activity that can happen * to the list entry is list_del_init(). Eg. it cannot be used * if another CPU could re-list_add() it. */ static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = head->next; return (next == head) && (next == head->prev); } /** * list_rotate_left - rotate the list to the left * @head: the head of the list */ static inline void list_rotate_left(struct list_head *head) { struct list_head *first; if (!list_empty(head)) { first = head->next; list_move_tail(first, head); } } /** * list_is_singular - tests whether a list has just one entry. * @head: the list to test. */ static inline int list_is_singular(const struct list_head *head) { return !list_empty(head) && (head->next == head->prev); } static inline void __list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { struct list_head *new_first = entry->next; list->next = head->next; list->next->prev = list; list->prev = entry; entry->next = list; head->next = new_first; new_first->prev = head; } /** * list_cut_position - cut a list into two * @list: a new list to add all removed entries * @head: a list with entries * @entry: an entry within head, could be the head itself * and if so we won't cut the list * * This helper moves the initial part of @head, up to and * including @entry, from @head to @list. You should * pass on @entry an element you know is on @head. @list * should be an empty list or a list you do not care about * losing its data. 
* */ static inline void list_cut_position(struct list_head *list, struct list_head *head, struct list_head *entry) { if (list_empty(head)) return; if (list_is_singular(head) && (head->next != entry && head != entry)) return; if (entry == head) INIT_LIST_HEAD(list); else __list_cut_position(list, head, entry); } static inline void __list_splice(const struct list_head *list, struct list_head *prev, struct list_head *next) { struct list_head *first = list->next; struct list_head *last = list->prev; first->prev = prev; prev->next = first; last->next = next; next->prev = last; } /** * list_splice - join two lists, this is designed for stacks * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice(const struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head, head->next); } /** * list_splice_tail - join two lists, each list being a queue * @list: the new list to add. * @head: the place to add it in the first list. */ static inline void list_splice_tail(struct list_head *list, struct list_head *head) { if (!list_empty(list)) __list_splice(list, head->prev, head); } /** * list_splice_init - join two lists and reinitialise the emptied list. * @list: the new list to add. * @head: the place to add it in the first list. * * The list at @list is reinitialised */ static inline void list_splice_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head, head->next); INIT_LIST_HEAD(list); } } /** * list_splice_tail_init - join two lists and reinitialise the emptied list * @list: the new list to add. * @head: the place to add it in the first list. * * Each of the lists is a queue. 
* The list at @list is reinitialised */ static inline void list_splice_tail_init(struct list_head *list, struct list_head *head) { if (!list_empty(list)) { __list_splice(list, head->prev, head); INIT_LIST_HEAD(list); } } /** * list_entry - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_struct within the struct. */ #define list_entry(ptr, type, member) \ container_of(ptr, type, member) /** * list_first_entry - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_struct within the struct. * * Note, that list is expected to be not empty. */ #define list_first_entry(ptr, type, member) \ list_entry((ptr)->next, type, member) /** * list_last_entry - get the last element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note, that list is expected to be not empty. */ #define list_last_entry(ptr, type, member) \ list_entry((ptr)->prev, type, member) /** * list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. */ #define list_for_each(pos, head) \ for (pos = (head)->next; pos != (head); pos = pos->next) /** * __list_for_each - iterate over a list * @pos: the &struct list_head to use as a loop cursor. * @head: the head for your list. * * This variant differs from list_for_each() in that it's the * simplest possible list iteration code, no prefetching is done. * Use this for code that knows the list to be very short (empty * or 1 entry) most of the time. */ #define __list_for_each(pos, head) \ for (pos = (head)->next; pos != (head); pos = pos->next) /** * list_for_each_prev - iterate over a list backwards * @pos: the &struct list_head to use as a loop cursor. 
* @head: the head for your list. */ #define list_for_each_prev(pos, head) \ for (pos = (head)->prev; pos != (head); pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_safe(pos, n, head) \ for (pos = (head)->next, n = pos->next; pos != (head); \ pos = n, n = pos->next) /** * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry * @pos: the &struct list_head to use as a loop cursor. * @n: another &struct list_head to use as temporary storage * @head: the head for your list. */ #define list_for_each_prev_safe(pos, n, head) \ for (pos = (head)->prev, n = pos->prev; \ pos != (head); \ pos = n, n = pos->prev) /** * list_for_each_entry - iterate over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_struct within the struct. */ #define list_for_each_entry(pos, head, member) \ for (pos = list_entry((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_reverse - iterate backwards over list of given type. * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_struct within the struct. */ #define list_for_each_entry_reverse(pos, head, member) \ for (pos = list_entry((head)->prev, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry(pos->member.prev, typeof(*pos), member)) /** * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() * @pos: the type * to use as a start point * @head: the head of the list * @member: the name of the list_struct within the struct. * * Prepares a pos entry for use as a start point in list_for_each_entry_continue(). 
*/ #define list_prepare_entry(pos, head, member) \ ((pos) ? : list_entry(head, typeof(*pos), member)) /** * list_for_each_entry_continue - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_struct within the struct. * * Continue to iterate over list of given type, continuing after * the current position. */ #define list_for_each_entry_continue(pos, head, member) \ for (pos = list_entry(pos->member.next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_continue_reverse - iterate backwards from the given point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_struct within the struct. * * Start to iterate over list of given type backwards, continuing after * the current position. */ #define list_for_each_entry_continue_reverse(pos, head, member) \ for (pos = list_entry(pos->member.prev, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry(pos->member.prev, typeof(*pos), member)) /** * list_for_each_entry_from - iterate over list of given type from the current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_struct within the struct. * * Iterate over list of given type, continuing from current position. */ #define list_for_each_entry_from(pos, head, member) \ for (; &pos->member != (head); \ pos = list_entry(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_struct within the struct. 
*/ #define list_for_each_entry_safe(pos, n, head, member) \ for (pos = list_entry((head)->next, typeof(*pos), member), \ n = list_entry(pos->member.next, typeof(*pos), member); \ &pos->member != (head); \ pos = n, n = list_entry(n->member.next, typeof(*n), member)) /** * list_for_each_entry_safe_continue - continue list iteration safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_struct within the struct. * * Iterate over list of given type, continuing after current point, * safe against removal of list entry. */ #define list_for_each_entry_safe_continue(pos, n, head, member) \ for (pos = list_entry(pos->member.next, typeof(*pos), member), \ n = list_entry(pos->member.next, typeof(*pos), member); \ &pos->member != (head); \ pos = n, n = list_entry(n->member.next, typeof(*n), member)) /** * list_for_each_entry_safe_from - iterate over list from current point safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_struct within the struct. * * Iterate over list of given type from current point, safe against * removal of list entry. */ #define list_for_each_entry_safe_from(pos, n, head, member) \ for (n = list_entry(pos->member.next, typeof(*pos), member); \ &pos->member != (head); \ pos = n, n = list_entry(n->member.next, typeof(*n), member)) /** * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @head: the head for your list. * @member: the name of the list_struct within the struct. * * Iterate backwards over list of given type, safe against removal * of list entry. 
*/ #define list_for_each_entry_safe_reverse(pos, n, head, member) \ for (pos = list_entry((head)->prev, typeof(*pos), member), \ n = list_entry(pos->member.prev, typeof(*pos), member); \ &pos->member != (head); \ pos = n, n = list_entry(n->member.prev, typeof(*n), member)) /** * list_safe_reset_next - reset a stale list_for_each_entry_safe loop * @pos: the loop cursor used in the list_for_each_entry_safe loop * @n: temporary storage used in list_for_each_entry_safe * @member: the name of the list_struct within the struct. * * list_safe_reset_next is not safe to use in general if the list may be * modified concurrently (eg. the lock is dropped in the loop body). An * exception to this is if the cursor element (pos) is pinned in the list, * and list_safe_reset_next is called after re-taking the lock and before * completing the current iteration of the loop body. */ #define list_safe_reset_next(pos, n, member) \ n = list_entry(pos->member.next, typeof(*pos), member) #endif sanlock-3.8.2/src/lockfile.c000066400000000000000000000037671371427612200157230ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "log.h" #include "lockfile.h" int lockfile(const char *dir, const char *name, int uid, int gid) { char path[PATH_MAX]; char buf[16]; struct flock lock; mode_t old_umask; int fd, rv; /* Make rundir group writable, allowing creation of the lockfile when * starting as root. 
*/ old_umask = umask(0002); rv = mkdir(dir, 0775); if (rv < 0 && errno != EEXIST) { umask(old_umask); return rv; } umask(old_umask); rv = chown(dir, uid, gid); if (rv < 0) { log_error("lockfile chown error %s: %s", dir, strerror(errno)); return rv; } snprintf(path, PATH_MAX, "%s/%s", dir, name); fd = open(path, O_CREAT|O_WRONLY|O_CLOEXEC, 0644); if (fd < 0) { log_error("lockfile open error %s: %s", path, strerror(errno)); return -1; } lock.l_type = F_WRLCK; lock.l_start = 0; lock.l_whence = SEEK_SET; lock.l_len = 0; rv = fcntl(fd, F_SETLK, &lock); if (rv < 0) { log_error("lockfile setlk error %s: %s", path, strerror(errno)); goto fail; } rv = ftruncate(fd, 0); if (rv < 0) { log_error("lockfile truncate error %s: %s", path, strerror(errno)); goto fail; } memset(buf, 0, sizeof(buf)); snprintf(buf, sizeof(buf), "%d\n", getpid()); rv = write(fd, buf, strlen(buf)); if (rv <= 0) { log_error("lockfile write error %s: %s", path, strerror(errno)); goto fail; } return fd; fail: close(fd); return -1; } sanlock-3.8.2/src/lockfile.h000066400000000000000000000006121371427612200157120ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __LOCKFILE_H__ #define __LOCKFILE_H__ int lockfile(const char *dir, const char *name, int uid, int gid); #endif sanlock-3.8.2/src/lockspace.c000066400000000000000000001452751371427612200161000ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "sanlock_admin.h" #include "sanlock_sock.h" #include "diskio.h" #include "ondisk.h" #include "log.h" #include "delta_lease.h" #include "lockspace.h" #include "resource.h" #include "watchdog.h" #include "task.h" #include "timeouts.h" #include "direct.h" static uint32_t space_id_counter = 1; static struct space *_search_space(const char *name, struct sync_disk *disk, uint64_t host_id, struct list_head *head1, struct list_head *head2, struct list_head *head3, int *listnum) { int i; struct space *sp; struct list_head *heads[] = {head1, head2, head3}; for (i = 0; i < 3; i++) { if (!heads[i]) { continue; } list_for_each_entry(sp, heads[i], list) { if (name && strncmp(sp->space_name, name, NAME_ID_SIZE)) continue; if (disk && strncmp(sp->host_id_disk.path, disk->path, SANLK_PATH_LEN)) continue; if (disk && sp->host_id_disk.offset != disk->offset) continue; if (host_id && sp->host_id != host_id) continue; if (listnum) *listnum = i+1; return sp; } } return NULL; } struct space *find_lockspace(const char *name) { return _search_space(name, NULL, 0, &spaces, &spaces_rem, &spaces_add, NULL); } static struct space *find_lockspace_id(uint32_t space_id) { struct space *sp; list_for_each_entry(sp, &spaces, list) { if (sp->space_id == space_id) return sp; } return NULL; } static void _set_space_info(struct space *sp, struct space_info *spi) { /* keep this in sync with any new fields added to struct space_info */ spi->space_id = sp->space_id; spi->io_timeout = sp->io_timeout; spi->sector_size = sp->sector_size; spi->align_size = sp->align_size; spi->host_id = sp->host_id; spi->host_generation = sp->host_generation; spi->killing_pids = sp->killing_pids; } int _lockspace_info(const char *space_name, struct space_info *spi) { struct space *sp; list_for_each_entry(sp, &spaces, list) { 
if (strncmp(sp->space_name, space_name, NAME_ID_SIZE)) continue; _set_space_info(sp, spi); return 0; } return -1; } int lockspace_info(const char *space_name, struct space_info *spi) { int rv; pthread_mutex_lock(&spaces_mutex); rv = _lockspace_info(space_name, spi); pthread_mutex_unlock(&spaces_mutex); return rv; } int lockspace_disk(char *space_name, struct sync_disk *disk, int *sector_size) { struct space *sp; int rv = -1; pthread_mutex_lock(&spaces_mutex); list_for_each_entry(sp, &spaces, list) { if (strncmp(sp->space_name, space_name, NAME_ID_SIZE)) continue; memcpy(disk, &sp->host_id_disk, sizeof(struct sync_disk)); *sector_size = sp->sector_size; disk->fd = -1; rv = 0; } pthread_mutex_unlock(&spaces_mutex); return rv; } #if 0 static void clear_bit(int host_id, char *bitmap) { char *byte = bitmap + ((host_id - 1) / 8); unsigned int bit = host_id % 8; *byte &= ~bit; } #endif void set_id_bit(int host_id, char *bitmap, char *c) { char *byte = bitmap + ((host_id - 1) / 8); unsigned int bit = (host_id - 1) % 8; char mask; mask = 1 << bit; *byte |= mask; if (c) *c = *byte; } /* FIXME: another copy in direct_lib.c */ int test_id_bit(int host_id, char *bitmap) { char *byte = bitmap + ((host_id - 1) / 8); unsigned int bit = (host_id - 1) % 8; char mask; mask = 1 << bit; return (*byte & mask); } int host_status_set_bit(char *space_name, uint64_t host_id) { struct space *sp; int found = 0; if (!host_id || host_id > DEFAULT_MAX_HOSTS) return -EINVAL; pthread_mutex_lock(&spaces_mutex); list_for_each_entry(sp, &spaces, list) { if (strncmp(sp->space_name, space_name, NAME_ID_SIZE)) continue; found = 1; break; } pthread_mutex_unlock(&spaces_mutex); if (!found) return -ENOSPC; if (host_id > sp->max_hosts) return -EINVAL; pthread_mutex_lock(&sp->mutex); sp->host_status[host_id-1].set_bit_time = monotime(); pthread_mutex_unlock(&sp->mutex); return 0; } int host_info(char *space_name, uint64_t host_id, struct host_status *hs_out) { struct space *sp; int found = 0; int toobig = 0; 
if (!host_id || host_id > DEFAULT_MAX_HOSTS) return -EINVAL; pthread_mutex_lock(&spaces_mutex); list_for_each_entry(sp, &spaces, list) { if (strncmp(sp->space_name, space_name, NAME_ID_SIZE)) continue; if (host_id > sp->max_hosts) { toobig = 1; break; } memcpy(hs_out, &sp->host_status[host_id-1], sizeof(struct host_status)); found = 1; if (!hs_out->io_timeout) { log_erros(sp, "host_info %llu use own io_timeout %d", (unsigned long long)host_id, sp->io_timeout); hs_out->io_timeout = sp->io_timeout; } break; } pthread_mutex_unlock(&spaces_mutex); if (toobig) return -EINVAL; if (!found) return -ENOSPC; return 0; } static void create_bitmap_and_extra(struct space *sp, char *bitmap, struct delta_extra *extra) { uint64_t now; int i; char c; now = monotime(); pthread_mutex_lock(&sp->mutex); for (i = 0; i < sp->max_hosts; i++) { if (i+1 == sp->host_id) continue; if (!sp->host_status[i].set_bit_time) continue; if (now - sp->host_status[i].set_bit_time > sp->set_bitmap_seconds) { /* log_space(sp, "bitmap clear host_id %d", i+1); */ sp->host_status[i].set_bit_time = 0; } else { set_id_bit(i+1, bitmap, &c); /* log_space(sp, "bitmap set host_id %d byte %x", i+1, c); */ } } extra->field1 = sp->host_event.generation; extra->field2 = sp->host_event.event; extra->field3 = sp->host_event.data; pthread_mutex_unlock(&sp->mutex); } /* * Called from main thread to look through the lease data collected in * the last renewal. Records liveness history about other hosts in the * lockspace, checks if another host is notifying us (through their bitmap) * to look at resource requests or an event they've written. * * NB. the way that this gets the copy of all leases to look at is * unfortunately very subtle and convoluted. * * The lockspace thread makes a copy of task iobuf, which holds all * delta leases that were read in the last renewal, into * sp->lease_status.renewal_read_buf. 
Then check_our_lease() called
 * by the main loop makes a copy of sp->lease_status.renewal_read_buf
 * to pass to this function.
 *
 * buf holds one leader record per host: the record for host_id i+1 is
 * read from buf + i * sp->sector_size, and that host's bitmap is read
 * HOSTID_BITMAP_OFFSET bytes after the start of its record.
 */

void check_other_leases(struct space *sp, char *buf)
{
	struct leader_record leader_in;
	struct leader_record *leader_end;
	struct leader_record *leader;
	struct host_status *hs;
	struct sanlk_host_event he;
	char *bitmap;
	uint64_t now;
	int i, new;

	now = monotime();
	new = 0;

	for (i = 0; i < sp->max_hosts; i++) {
		hs = &sp->host_status[i];
		hs->last_check = now;

		if (!hs->first_check)
			hs->first_check = now;

		/* leader_end is the raw record in buf, leader the converted copy */
		leader_end = (struct leader_record *)(buf + (i * sp->sector_size));

		leader_record_in(leader_end, &leader_in);
		leader = &leader_in;

		/*
		 * If this lease has invalid fields, log an error.  Limit the logging
		 * frequency to avoid blowing up logs if some lease is left in a bad
		 * state for a long time.  Should we check other fields in addition
		 * to magic and space_name?
		 */

		if ((leader->magic != DELTA_DISK_MAGIC) ||
		    (strncmp(leader->space_name, sp->space_name, NAME_ID_SIZE))) {
			/* log the first bad read and then every 100th one */
			if (!hs->lease_bad || !(hs->lease_bad % 100)) {
				log_erros(sp, "check_other_lease invalid for host %llu %llu ts %llu name %.48s in %.48s",
					  (unsigned long long)hs->owner_id,
					  (unsigned long long)hs->owner_generation,
					  (unsigned long long)hs->timestamp,
					  hs->owner_name,
					  sp->space_name);

				log_erros(sp, "check_other_lease leader %x owner %llu %llu ts %llu sn %.48s rn %.48s",
					  leader->magic,
					  (unsigned long long)leader->owner_id,
					  (unsigned long long)leader->owner_generation,
					  (unsigned long long)leader->timestamp,
					  leader->space_name,
					  leader->resource_name);
			}

			/* count the bad read; if the counter wraps to 0, bump it
			   again so a bad lease never looks good */
			hs->lease_bad++;
			if (!hs->lease_bad)
				hs->lease_bad++;
		} else {
			if (hs->lease_bad) {
				log_erros(sp, "check_other_lease corrected for host %llu %llu ts %llu name %.48s in %.48s",
					  (unsigned long long)hs->owner_id,
					  (unsigned long long)hs->owner_generation,
					  (unsigned long long)hs->timestamp,
					  hs->owner_name,
					  sp->space_name);
			}
			hs->lease_bad = 0;
		}

		/*
		 * Save a record of each new host instance to help with debugging.
		 */
		if (!hs->lease_bad && (strncmp(hs->owner_name, leader->resource_name, NAME_ID_SIZE) ||
		    (hs->owner_generation != leader->owner_generation))) {
			log_warns(sp, "host %llu %llu %llu %.48s",
				  (unsigned long long)leader->owner_id,
				  (unsigned long long)leader->owner_generation,
				  (unsigned long long)leader->timestamp,
				  leader->resource_name);
			strncpy(hs->owner_name, leader->resource_name, NAME_ID_SIZE);
		}

		/*
		 * Nothing changed since the last check: skip the rest, which
		 * in particular leaves last_live untouched.
		 */
		if (hs->owner_id == leader->owner_id &&
		    hs->owner_generation == leader->owner_generation &&
		    hs->timestamp == leader->timestamp) {
			continue;
		}

		/*
		 * Replacing good values with potentially bad values
		 * would have no purpose, and would confuse things, so
		 * don't replace these fields if the lease is bad.
		 * But, continue to update the timestamp because we don't
		 * want to consider the host to be dead if the lease
		 * is being renewed, even if the lease has bad fields.
		 */
		if (!hs->lease_bad) {
			hs->owner_id = leader->owner_id;
			hs->owner_generation = leader->owner_generation;
			strncpy(hs->owner_name, leader->resource_name, NAME_ID_SIZE);
			hs->io_timeout = leader->io_timeout;
		}
		hs->timestamp = leader->timestamp;
		hs->last_live = now;

		/* our own lease: no need to look at our own bitmap */
		if (i+1 == sp->host_id)
			continue;

		bitmap = (char *)leader_end + HOSTID_BITMAP_OFFSET;

		if (!test_id_bit(sp->host_id, bitmap))
			continue;

		/*
		 * Our bit is set in the bitmap, so this host is
		 * notifying us of a host_event or resource request.
		 */

		/* for delta leases the write_ fields of the leader carry the
		   sender's host event (generation, event, data) */
		memset(&he, 0, sizeof(he));
		he.host_id = sp->host_id;
		he.generation = leader->write_id;
		he.event = leader->write_generation;
		he.data = leader->write_timestamp;

		/*
		 * Pass an event to the resource_thread which is a
		 * convenient place to do callbacks (we don't want
		 * the main thread to be delayed with that.)
		 */

		if (he.event) {
			/*
			 * lock order: spaces_mutex (main_loop), then
			 * resource_mutex (add_host_event).
			 */
			log_space(sp, "host event from host_id %d", i+1);
			add_host_event(sp->space_id, &he, hs->owner_id, hs->owner_generation);
		}

		/* this host has made a resource request for us, we won't take a new
		   request from this host for another set_bitmap_seconds */

		if (now - hs->last_req < sp->set_bitmap_seconds)
			continue;

		log_space(sp, "request from host_id %d", i+1);
		hs->last_req = now;
		new = 1;
	}

	/*
	 * Have the resource_thread check the request records of resources
	 * in this lockspace.
	 */

	if (new)
		set_resource_examine(sp->space_name, NULL);
}

/*
 * check if our_host_id_thread has renewed within timeout
 *
 * Under sp->mutex this snapshots renewal_last_success and corrupt_result,
 * and, if a renewal has read new lease data since the last call
 * (renewal_read_count > renewal_read_check), sets *check_all to 1 and
 * copies align_size bytes of renewal_read_buf into check_buf (when
 * check_buf is not NULL).  *check_all is never cleared here.
 *
 * Returns -1 if the last renewal result was recorded as corrupt or the
 * time since the last successful renewal has reached
 * id_renewal_fail_seconds; otherwise 0 (a gap that has reached
 * id_renewal_warn_seconds is only logged).
 */

int check_our_lease(struct space *sp, int *check_all, char *check_buf)
{
	int id_renewal_fail_seconds, id_renewal_warn_seconds;
	uint64_t last_success;
	int corrupt_result;
	int gap;

	pthread_mutex_lock(&sp->mutex);
	last_success = sp->lease_status.renewal_last_success;
	corrupt_result = sp->lease_status.corrupt_result;

	if (sp->lease_status.renewal_read_count > sp->lease_status.renewal_read_check) {
		/*
		 * NB. it's unfortunate how subtle this is.
		 * main loop will pass this buf to check_other_leases next
		 */
		sp->lease_status.renewal_read_check = sp->lease_status.renewal_read_count;
		*check_all = 1;
		if (check_buf)
			memcpy(check_buf, sp->lease_status.renewal_read_buf, sp->align_size);
	}
	pthread_mutex_unlock(&sp->mutex);

	if (corrupt_result) {
		log_erros(sp, "check_our_lease corrupt %d", corrupt_result);
		return -1;
	}

	/* seconds since the last successful renewal */
	gap = monotime() - last_success;

	id_renewal_fail_seconds = calc_id_renewal_fail_seconds(sp->io_timeout);
	id_renewal_warn_seconds = calc_id_renewal_warn_seconds(sp->io_timeout);

	if (gap >= id_renewal_fail_seconds) {
		log_erros(sp, "check_our_lease failed %d", gap);
		return -1;
	}

	if (gap >= id_renewal_warn_seconds) {
		log_erros(sp, "check_our_lease warning %d last_success %llu",
			  gap, (unsigned long long)last_success);
	}

	if (com.debug_renew > 1) {
		log_space(sp, "check_our_lease good %d %llu",
		          gap, (unsigned long long)last_success);
	}

	return 0;
}

/* If a renewal result is one of the listed errors, it means our
   delta lease has been corrupted/overwritten/reinitialized out from
   under us, and we should stop using it immediately.  There's no
   point in retrying the renewal.
*/ static int corrupt_result(int result) { switch (result) { case SANLK_RENEW_OWNER: case SANLK_RENEW_DIFF: case SANLK_LEADER_MAGIC: case SANLK_LEADER_VERSION: case SANLK_LEADER_SECTORSIZE: case SANLK_LEADER_LOCKSPACE: case SANLK_LEADER_CHECKSUM: return result; default: return 0; } } static void close_event_fds(struct space *sp) { int i; pthread_mutex_lock(&sp->mutex); for (i = 0; i < MAX_EVENT_FDS; i++) { if (sp->event_fds[i] == -1) continue; close(sp->event_fds[i]); sp->event_fds[i] = -1; } pthread_mutex_unlock(&sp->mutex); } /* * if delta_result is success: * new record saving last_success (timestamp in renewal), rd_ms, wr_ms * if delta_result is timeout: * increment next_timeouts in prev record * if delta_result is failure: * increment next_errors in prev record */ static void save_renewal_history(struct space *sp, int delta_result, uint64_t last_success, int rd_ms, int wr_ms) { struct renewal_history *hi; if (!sp->renewal_history_size || !sp->renewal_history) return; if (delta_result == SANLK_OK) { hi = &sp->renewal_history[sp->renewal_history_next]; hi->timestamp = last_success; hi->read_ms = rd_ms; hi->write_ms = wr_ms; sp->renewal_history_prev = sp->renewal_history_next; sp->renewal_history_next++; if (sp->renewal_history_next >= sp->renewal_history_size) sp->renewal_history_next = 0; } else { hi = &sp->renewal_history[sp->renewal_history_prev]; if (delta_result == SANLK_AIO_TIMEOUT) hi->next_timeouts++; else hi->next_errors++; } } #define ONE_MB_IN_BYTES 1048576 #define ONE_MB_IN_KB 1024 static void set_lockspace_max_sectors_kb(struct space *sp, int sector_size, int align_size) { struct stat st; int align_size_kb = align_size / 1024; /* align_size is in bytes */ unsigned int hw_kb = 0; unsigned int set_kb = 0; int rv; if (fstat(sp->host_id_disk.fd, &st) < 0) { log_erros(sp, "set_lockspace_max_sectors_kb fstat error %d", errno); return; } /* file not device */ if (S_ISREG(st.st_mode)) return; if (com.max_sectors_kb_ignore) return; else if 
(com.max_sectors_kb_align) set_kb = align_size_kb; else if (com.max_sectors_kb_num) set_kb = com.max_sectors_kb_num; else return; rv = read_sysfs_size(sp->host_id_disk.path, "max_hw_sectors_kb", &hw_kb); if (rv < 0 || !hw_kb) { log_space(sp, "set_lockspace_max_sectors_kb max_hw_sectors_kb unknown %d %u", rv, hw_kb); return; } if (hw_kb < set_kb) { /* * If the hardware won't support requested size, try setting 1MB. */ if (hw_kb < ONE_MB_IN_KB) { log_space(sp, "set_lockspace_max_sectors_kb small hw_kb %u req_kb %u", hw_kb, set_kb); return; } if (set_kb < 1024) { log_space(sp, "set_lockspace_max_sectors_kb small hw_kb %u small req_kb %u", hw_kb, set_kb); return; } set_kb = ONE_MB_IN_KB; log_space(sp, "set_lockspace_max_sectors_kb small hw_kb %u using 1024", hw_kb); rv = set_max_sectors_kb(&sp->host_id_disk, set_kb); if (rv < 0) { log_space(sp, "set_lockspace_max_sectors_kb small hw_kb %u set 1024 error %d", hw_kb, rv); return; } } else { /* * Tell the kernel to send hardware io's as large as the lease size. */ log_space(sp, "set_lockspace_max_sectors_kb hw_kb %u setting %u", hw_kb, set_kb); rv = set_max_sectors_kb(&sp->host_id_disk, set_kb); if (rv < 0) { log_space(sp, "set_lockspace_max_sectors_kb hw_kb %u set %u error %d", hw_kb, set_kb, rv); return; } } } /* * This thread must not be stopped unless all pids that may be using any * resources in it are dead/gone. (The USED flag in the lockspace represents * pids using resources in the lockspace, when those pids are not using actual * sanlock resources. So the USED flag must also prevent this thread from * stopping.) 
*/ static void *lockspace_thread(void *arg_in) { char bitmap[HOSTID_BITMAP_SIZE]; struct delta_extra extra; struct task task; struct space *sp; struct leader_record leader; uint64_t delta_begin, last_success = 0; int sector_size = 0; int align_size = 0; int max_hosts = 0; int log_renewal_level = -1; int rv, delta_length, renewal_interval = 0; int id_renewal_seconds, id_renewal_fail_seconds; int acquire_result, delta_result, read_result; int rd_ms, wr_ms; int opened = 0; int stop = 0; int wd_con; if (com.debug_renew) log_renewal_level = LOG_DEBUG; sp = (struct space *)arg_in; memset(&task, 0, sizeof(struct task)); setup_task_aio(&task, main_task.use_aio, HOSTID_AIO_CB_SIZE); memcpy(task.name, sp->space_name, NAME_ID_SIZE); id_renewal_seconds = calc_id_renewal_seconds(sp->io_timeout); id_renewal_fail_seconds = calc_id_renewal_fail_seconds(sp->io_timeout); delta_begin = monotime(); rv = open_disk(&sp->host_id_disk); if (rv < 0) { log_erros(sp, "open_disk %s error %d", sp->host_id_disk.path, rv); acquire_result = -ENODEV; delta_result = -1; goto set_status; } opened = 1; rv = delta_read_lockspace_sizes(&task, &sp->host_id_disk, sp->io_timeout, §or_size, &align_size); if (rv < 0) { log_erros(sp, "failed to read device to find sector size error %d %s", rv, sp->host_id_disk.path); acquire_result = rv; delta_result = -1; goto set_status; } if ((sector_size != 512) && (sector_size != 4096)) { log_erros(sp, "failed to get valid sector size %d %s", sector_size, sp->host_id_disk.path); acquire_result = SANLK_LEADER_SECTORSIZE; delta_result = -1; goto set_status; } max_hosts = size_to_max_hosts(sector_size, align_size); if (!max_hosts) { log_erros(sp, "invalid combination of sector size %d and align_size %d", sector_size, align_size); acquire_result = SANLK_ADDLS_SIZES; delta_result = -1; goto set_status; } if (sp->host_id > max_hosts) { log_erros(sp, "host_id %llu too large for max_hosts %d", (unsigned long long)sp->host_id, max_hosts); acquire_result = 
SANLK_ADDLS_INVALID_HOSTID; delta_result = -1; goto set_status; } sp->sector_size = sector_size; sp->align_size = align_size; sp->max_hosts = max_hosts; set_lockspace_max_sectors_kb(sp, sector_size, align_size); sp->lease_status.renewal_read_buf = malloc(sp->align_size); if (!sp->lease_status.renewal_read_buf) { acquire_result = -ENOMEM; delta_result = -1; goto set_status; } /* Connect first so we can fail quickly if wdmd is not running. */ wd_con = connect_watchdog(sp); if (wd_con < 0) { log_erros(sp, "connect_watchdog failed %d", wd_con); acquire_result = SANLK_WD_ERROR; delta_result = -1; goto set_status; } /* * acquire the delta lease */ delta_begin = monotime(); delta_result = delta_lease_acquire(&task, sp, &sp->host_id_disk, sp->space_name, our_host_name_global, sp->host_id, &leader); delta_length = monotime() - delta_begin; if (delta_result == SANLK_OK) last_success = leader.timestamp; acquire_result = delta_result; /* we need to start the watchdog after we acquire the host_id but before we allow any pid's to begin running */ if (delta_result == SANLK_OK) { rv = activate_watchdog(sp, last_success, id_renewal_fail_seconds, wd_con); if (rv < 0) { log_erros(sp, "activate_watchdog failed %d", rv); acquire_result = SANLK_WD_ERROR; } } else { if (com.use_watchdog) close(wd_con); } set_status: pthread_mutex_lock(&sp->mutex); sp->lease_status.acquire_last_result = acquire_result; sp->lease_status.acquire_last_attempt = delta_begin; if (delta_result == SANLK_OK) sp->lease_status.acquire_last_success = last_success; sp->lease_status.renewal_last_result = acquire_result; sp->lease_status.renewal_last_attempt = delta_begin; if (delta_result == SANLK_OK) sp->lease_status.renewal_last_success = last_success; /* First renewal entry shows the acquire time with 0 latencies. 
*/ save_renewal_history(sp, delta_result, last_success, 0, 0); pthread_mutex_unlock(&sp->mutex); if (acquire_result < 0) goto out; sp->host_generation = leader.owner_generation; while (1) { pthread_mutex_lock(&sp->mutex); stop = sp->thread_stop; pthread_mutex_unlock(&sp->mutex); if (stop) break; /* * wait between each renewal */ if (monotime() - last_success < id_renewal_seconds) { sleep(1); continue; } else { /* don't spin too quickly if renew is failing immediately and repeatedly */ usleep(500000); } /* * do a renewal, measuring length of time spent in renewal, * and the length of time between successful renewals */ memset(bitmap, 0, sizeof(bitmap)); memset(&extra, 0, sizeof(extra)); create_bitmap_and_extra(sp, bitmap, &extra); delta_begin = monotime(); delta_result = delta_lease_renew(&task, sp, &sp->host_id_disk, sp->space_name, bitmap, &extra, delta_result, &read_result, log_renewal_level, &leader, &leader, &rd_ms, &wr_ms); delta_length = monotime() - delta_begin; if (delta_result == SANLK_OK) { renewal_interval = leader.timestamp - last_success; last_success = leader.timestamp; } /* * publish the results */ pthread_mutex_lock(&sp->mutex); sp->lease_status.renewal_last_result = delta_result; sp->lease_status.renewal_last_attempt = delta_begin; if (delta_result == SANLK_OK) sp->lease_status.renewal_last_success = last_success; if (delta_result != SANLK_OK && !sp->lease_status.corrupt_result) sp->lease_status.corrupt_result = corrupt_result(delta_result); if (read_result == SANLK_OK && task.iobuf) { /* NB. 
be careful with how this iobuf escapes */ memcpy(sp->lease_status.renewal_read_buf, task.iobuf, sp->align_size); sp->lease_status.renewal_read_count++; } /* * pet the watchdog * (don't update on thread_stop because it's probably unlinked) */ if (delta_result == SANLK_OK && !sp->thread_stop) update_watchdog(sp, last_success, id_renewal_fail_seconds); save_renewal_history(sp, delta_result, last_success, rd_ms, wr_ms); pthread_mutex_unlock(&sp->mutex); /* * log the results */ if (delta_result != SANLK_OK) { log_erros(sp, "renewal error %d delta_length %d last_success %llu", delta_result, delta_length, (unsigned long long)last_success); } else if (delta_length > id_renewal_seconds) { log_erros(sp, "renewed %llu delta_length %d too long", (unsigned long long)last_success, delta_length); } else { if (com.debug_renew) { log_space(sp, "renewed %llu delta_length %d interval %d", (unsigned long long)last_success, delta_length, renewal_interval); } } } /* watchdog unlink was done in main_loop when thread_stop was set, to get it done as quickly as possible in case the wd is about to fire. */ close_watchdog(sp); out: if (delta_result == SANLK_OK) delta_lease_release(&task, sp, &sp->host_id_disk, sp->space_name, &leader, &leader); if (opened) close(sp->host_id_disk.fd); /* * TODO: are there cases where struct resources for this lockspace * still exist on resource_held/resource_add/resource_rem? Is that ok? * Should we purge all of them here? When a lockspace is removed and * pids are killed, their resources go through release_token_async, * which will see token->space_dead, and those resources are freed * directly. resources that may have already been on resources_rem and * the resource_thread may be in the middle of releasing one of them. * For any further async releases, resource_thread will see that the * lockspace is going away and will just free the resource. 
*/ purge_resource_orphans(sp->space_name); purge_resource_free(sp->space_name); close_event_fds(sp); close_task_aio(&task); return NULL; } static void free_sp(struct space *sp) { if (sp->lease_status.renewal_read_buf) free(sp->lease_status.renewal_read_buf); free(sp); } int add_lockspace_start(struct sanlk_lockspace *ls, uint32_t io_timeout, struct space **sp_out) { struct space *sp, *sp2; int listnum = 0; int rv; int i; if (!ls->name[0] || !ls->host_id || !ls->host_id_disk.path[0]) { log_error("add_lockspace bad args id %llu name %zu path %zu", (unsigned long long)ls->host_id, strlen(ls->name), strlen(ls->host_id_disk.path)); return -EINVAL; } sp = malloc(sizeof(struct space)); if (!sp) return -ENOMEM; memset(sp, 0, sizeof(struct space)); memcpy(sp->space_name, ls->name, NAME_ID_SIZE); memcpy(&sp->host_id_disk, &ls->host_id_disk, sizeof(struct sanlk_disk)); sp->host_id_disk.sector_size = 0; sp->host_id_disk.fd = -1; sp->host_id = ls->host_id; sp->io_timeout = io_timeout; sp->set_bitmap_seconds = calc_set_bitmap_seconds(io_timeout); pthread_mutex_init(&sp->mutex, NULL); if (com.renewal_read_extend_sec_set) sp->renewal_read_extend_sec = com.renewal_read_extend_sec; else sp->renewal_read_extend_sec = io_timeout; for (i = 0; i < MAX_EVENT_FDS; i++) sp->event_fds[i] = -1; if (com.renewal_history_size) { sp->renewal_history = malloc(sizeof(struct renewal_history) * com.renewal_history_size); if (sp->renewal_history) { sp->renewal_history_size = com.renewal_history_size; memset(sp->renewal_history, 0, sizeof(struct renewal_history) * com.renewal_history_size); } } pthread_mutex_lock(&spaces_mutex); /* search all lists for an identical lockspace */ sp2 = _search_space(sp->space_name, &sp->host_id_disk, sp->host_id, &spaces, NULL, NULL, NULL); if (sp2) { pthread_mutex_unlock(&spaces_mutex); rv = -EEXIST; goto fail_free; } sp2 = _search_space(sp->space_name, &sp->host_id_disk, sp->host_id, &spaces_add, NULL, NULL, NULL); if (sp2) { pthread_mutex_unlock(&spaces_mutex); rv = 
-EINPROGRESS; goto fail_free; } sp2 = _search_space(sp->space_name, &sp->host_id_disk, sp->host_id, &spaces_rem, NULL, NULL, NULL); if (sp2) { pthread_mutex_unlock(&spaces_mutex); rv = -EAGAIN; goto fail_free; } /* search all lists for a lockspace with the same name */ sp2 = _search_space(sp->space_name, NULL, 0, &spaces, &spaces_add, &spaces_rem, &listnum); if (sp2) { log_error("add_lockspace %.48s:%llu:%.256s:%llu conflicts with name of list%d s%d %.48s:%llu:%.256s:%llu", sp->space_name, (unsigned long long)sp->host_id, sp->host_id_disk.path, (unsigned long long)sp->host_id_disk.offset, listnum, sp2->space_id, sp2->space_name, (unsigned long long)sp2->host_id, sp2->host_id_disk.path, (unsigned long long)sp2->host_id_disk.offset); pthread_mutex_unlock(&spaces_mutex); rv = -EINVAL; goto fail_free; } /* search all lists for a lockspace with the same host_id_disk */ sp2 = _search_space(NULL, &sp->host_id_disk, 0, &spaces, &spaces_add, &spaces_rem, &listnum); if (sp2) { log_error("add_lockspace %.48s:%llu:%.256s:%llu conflicts with path of list%d s%d %.48s:%llu:%.256s:%llu", sp->space_name, (unsigned long long)sp->host_id, sp->host_id_disk.path, (unsigned long long)sp->host_id_disk.offset, listnum, sp2->space_id, sp2->space_name, (unsigned long long)sp2->host_id, sp2->host_id_disk.path, (unsigned long long)sp2->host_id_disk.offset); pthread_mutex_unlock(&spaces_mutex); rv = -EINVAL; goto fail_free; } sp->space_id = space_id_counter++; list_add(&sp->list, &spaces_add); pthread_mutex_unlock(&spaces_mutex); /* save a record of what this space_id is for later debugging */ log_warns(sp, "lockspace %.48s:%llu:%.256s:%llu", sp->space_name, (unsigned long long)sp->host_id, sp->host_id_disk.path, (unsigned long long)sp->host_id_disk.offset); rv = pthread_create(&sp->thread, NULL, lockspace_thread, sp); if (rv < 0) { log_erros(sp, "add_lockspace create thread failed"); goto fail_del; } *sp_out = sp; return 0; fail_del: pthread_mutex_lock(&spaces_mutex); list_del(&sp->list); 
pthread_mutex_unlock(&spaces_mutex); fail_free: free_sp(sp); return rv; } int add_lockspace_wait(struct space *sp) { int rv, result; while (1) { pthread_mutex_lock(&sp->mutex); result = sp->lease_status.acquire_last_result; pthread_mutex_unlock(&sp->mutex); if (result) break; sleep(1); } if (result != SANLK_OK) { /* the thread exits right away if acquire fails */ pthread_join(sp->thread, NULL); rv = result; log_erros(sp, "add_lockspace fail result %d", result); goto fail_del; } /* Once we move sp to spaces list, tokens can begin using it, the main loop will begin monitoring its renewals, and will handle removing it. */ pthread_mutex_lock(&spaces_mutex); if (sp->external_remove || external_shutdown) { pthread_mutex_unlock(&spaces_mutex); log_space(sp, "add_lockspace undo remove %d shutdown %d", sp->external_remove, external_shutdown); /* We've caught a remove/shutdown just before completing the add process. Don't complete it, but reverse the add, leaving the sp on spaces_add while reversing. Do the same thing that main_loop would do, except we don't have to go through killing_pids and checking for all_pids_dead since this lockspace has never been on the spaces list, so it could not have been used yet. 
*/ pthread_mutex_lock(&sp->mutex); sp->thread_stop = 1; deactivate_watchdog(sp); pthread_mutex_unlock(&sp->mutex); pthread_join(sp->thread, NULL); rv = -1; log_space(sp, "add_lockspace undo complete"); goto fail_del; } else { list_move(&sp->list, &spaces); log_space(sp, "add_lockspace done"); pthread_mutex_unlock(&spaces_mutex); return 0; } fail_del: pthread_mutex_lock(&spaces_mutex); list_del(&sp->list); pthread_mutex_unlock(&spaces_mutex); free_sp(sp); return rv; } int inq_lockspace(struct sanlk_lockspace *ls) { int rv; struct space *sp; pthread_mutex_lock(&spaces_mutex); sp = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id, &spaces, NULL, NULL, NULL); if (sp) { rv = 0; goto out; } else { rv = -ENOENT; } sp = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id, &spaces_add, &spaces_rem, NULL, NULL); if (sp) rv = -EINPROGRESS; out: pthread_mutex_unlock(&spaces_mutex); return rv; } int rem_lockspace_start(struct sanlk_lockspace *ls, unsigned int *space_id) { struct space *sp; unsigned int id; int rv; pthread_mutex_lock(&spaces_mutex); sp = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id, &spaces_rem, NULL, NULL, NULL); if (sp) { pthread_mutex_unlock(&spaces_mutex); rv = -EINPROGRESS; goto out; } sp = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id, &spaces_add, NULL, NULL, NULL); if (sp) { /* add_lockspace will be aborted and undone and the sp will not be moved to the spaces list */ sp->external_remove = 1; id = sp->space_id; pthread_mutex_unlock(&spaces_mutex); *space_id = id; rv = 0; goto out; } sp = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id, &spaces, NULL, NULL, NULL); if (!sp) { pthread_mutex_unlock(&spaces_mutex); rv = -ENOENT; goto out; } if (sp->rindex_op) { log_space(sp, "rem_lockspace ignored for rindex_op %d", sp->rindex_op); pthread_mutex_unlock(&spaces_mutex); rv = -EBUSY; goto out; } /* * Removal happens in a round 
about way: * - we set external_remove * - main_loop sees external_remove and sets space_dead, killing_pids * - main_loop sees killing_pids and all pids dead, sets thread_stop, * and moves sp from spaces to spaces_rem * - main_loop calls free_lockspaces(0), which joins any * lockspace_thread that is done, and then frees sp * * Once we release spaces_mutex, the sp could be freed any time, * so we can't touch it. Use its space_id to check for completion. */ sp->external_remove = 1; id = sp->space_id; pthread_mutex_unlock(&spaces_mutex); *space_id = id; rv = 0; out: return rv; } /* check for matching space_id in case the lockspace is added again */ int rem_lockspace_wait(struct sanlk_lockspace *ls, unsigned int space_id) { struct space *sp; int done; while (1) { pthread_mutex_lock(&spaces_mutex); sp = _search_space(ls->name, (struct sync_disk *)&ls->host_id_disk, ls->host_id, &spaces, &spaces_rem, &spaces_add, NULL); if (sp && (sp->space_id == space_id)) done = 0; else done = 1; pthread_mutex_unlock(&spaces_mutex); if (done) break; sleep(1); } return 0; } int get_lockspaces(char *buf, int *len, int *count, int maxlen) { struct sanlk_lockspace *ls; struct space *sp; struct list_head *heads[] = {&spaces, &spaces_rem, &spaces_add}; int i, rv, sp_count = 0; rv = 0; *len = 0; *count = 0; ls = (struct sanlk_lockspace *)buf; pthread_mutex_lock(&spaces_mutex); for (i = 0; i < 3; i++) { list_for_each_entry(sp, heads[i], list) { sp_count++; if (*len + sizeof(struct sanlk_lockspace) > maxlen) { rv = -ENOSPC; continue; } memcpy(ls->name, sp->space_name, NAME_ID_SIZE); memcpy(&ls->host_id_disk, &sp->host_id_disk, sizeof(struct sync_disk)); ls->host_id_disk.pad1 = 0; ls->host_id_disk.pad2 = 0; ls->host_id = sp->host_id; ls->flags = 0; if (i == 1) ls->flags |= SANLK_LSF_REM; else if (i == 2) ls->flags |= SANLK_LSF_ADD; *len += sizeof(struct sanlk_lockspace); ls++; } } pthread_mutex_unlock(&spaces_mutex); *count = sp_count; return rv; } /* * After the lockspace starts, there is a 
limited amount of * time that we've been watching the other hosts. This means * we can't make an accurate assessment of their state, because * the state is based on monitoring the hosts for host_fail_seconds * and host_dead_seconds, or seeing a renewal. When none of * those are true (not enough time monitoring and not seeing a * renewal), we return UNKNOWN. * * (Example number of seconds below are based on hosts using the * default 10 second io timeout.) * * * For hosts that are alive when we start, we return: * UNKNOWN then LIVE * * UNKNOWN would typically last for 10-20 seconds, but it's possible that * UNKNOWN could persist for up to 80 seconds before LIVE is returned. * LIVE is returned after we see the timestamp change once. * * * For hosts that are dead when we start, we'd return: * UNKNOWN then FAIL then DEAD * * UNKNOWN would last for 80 seconds before we return FAIL. * FAIL would last for 60 more seconds before we return DEAD. * * * Hosts that are failing and don't recover would be the same as prev. * * * For hosts thet are failing but recover, we'd return: * UNKNOWN then FAIL then LIVE * * * For another host that is alive when we start, * the sequence of values is: * * 0: we have not yet called check_other_leases() * first_check = 0, last_check = 0, last_live = 0 * * other host renews its lease * * 10: we call check_other_leases() for the first time, * first_check = 10, last_check = 10, last_live = 10 * * other host renews its lease * * 20: we call check_other_leases() for the second time, * first_check = 10, last_check = 20, last_live = 20 * * At 10, we have not yet seen a renewal from the other host, i.e. we have * not seen its timestamp change (we only have one sample). The host could * be dead or alive, so we set the state to UNKNOWN. The way we know * that we have not yet observed the timestamp change is that * first_check == last_live, (10 == 10). * * At 20, we have seen a renewal, i.e. the timestamp changed between checks, * so we return LIVE. 
* * In the other case, if the host was actually dead, not alive, it would not * have renewed between 10 and 20. So at 20 we would continue to see * first_check == last_live, and would return UNKNOWN. If the host remains * dead, we'd continue to report UNKNOWN for the first 80 seconds. * After 80 seconds, we'd return FAIL. After 140 seconds we'd return DEAD. */ /* Also see host_live() */ static uint32_t get_host_flag(struct space *sp, struct host_status *hs) { uint64_t now, last; uint32_t flags; uint32_t other_io_timeout; int other_host_fail_seconds, other_host_dead_seconds; now = monotime(); other_io_timeout = hs->io_timeout; other_host_fail_seconds = calc_id_renewal_fail_seconds(other_io_timeout); other_host_dead_seconds = calc_host_dead_seconds(other_io_timeout); flags = 0; if (!hs->timestamp) { flags = SANLK_HOST_FREE; goto out; } if (!hs->last_live) last = hs->first_check; else last = hs->last_live; if (sp->host_id == hs->owner_id) { /* we are alive */ flags = SANLK_HOST_LIVE; } else if ((now - last <= other_host_fail_seconds) && (hs->first_check == hs->last_live)) { /* we haven't seen the timestamp change yet */ flags = SANLK_HOST_UNKNOWN; } else if (now - last <= other_host_fail_seconds) { flags = SANLK_HOST_LIVE; } else if (now - last > other_host_dead_seconds) { flags = SANLK_HOST_DEAD; } else if (now - last > other_host_fail_seconds) { flags = SANLK_HOST_FAIL; } out: return flags; } int get_hosts(struct sanlk_lockspace *ls, char *buf, int *len, int *count, int maxlen) { struct space *sp; struct host_status *hs; struct sanlk_host *host; int host_count = 0; int i, rv; rv = 0; *len = 0; *count = 0; host = (struct sanlk_host *)buf; pthread_mutex_lock(&spaces_mutex); sp = _search_space(ls->name, NULL, 0, &spaces, NULL, NULL, NULL); if (!sp) { rv = -ENOENT; goto out; } /* * Between add_lockspace completing and the first * time we call check_other_leases, we don't have * any data on other hosts, so return this error * to indicate this to the caller. 
*/ if (!sp->host_status[0].last_check) { rv = -EAGAIN; goto out; } for (i = 0; i < sp->max_hosts; i++) { hs = &sp->host_status[i]; if (ls->host_id && (ls->host_id != (i + 1))) continue; if (!ls->host_id && !hs->timestamp) continue; host_count++; if (*len + sizeof(struct sanlk_host) > maxlen) { rv = -ENOSPC; continue; } host->host_id = i + 1; host->generation = hs->owner_generation; host->timestamp = hs->timestamp; host->io_timeout = hs->io_timeout; host->flags = get_host_flag(sp, hs); *len += sizeof(struct sanlk_host); host++; } out: pthread_mutex_unlock(&spaces_mutex); *count = host_count; return rv; } int lockspace_set_config(struct sanlk_lockspace *ls, GNUC_UNUSED uint32_t flags, uint32_t cmd) { struct space *sp; int rv; pthread_mutex_lock(&spaces_mutex); sp = _search_space(ls->name, NULL, 0, &spaces, NULL, NULL, NULL); if (!sp) { pthread_mutex_unlock(&spaces_mutex); rv = -ENOENT; goto out; } pthread_mutex_unlock(&spaces_mutex); pthread_mutex_lock(&sp->mutex); switch (cmd) { case SANLK_CONFIG_USED: if (sp->space_dead) { rv = -ENOSPC; } else { sp->flags |= SP_EXTERNAL_USED; rv = 0; } break; case SANLK_CONFIG_UNUSED: sp->flags &= ~SP_EXTERNAL_USED; rv = 0; break; case SANLK_CONFIG_USED_BY_ORPHANS: if (sp->space_dead) { rv = -ENOSPC; } else { sp->flags |= SP_USED_BY_ORPHANS; rv = 0; } break; case SANLK_CONFIG_UNUSED_BY_ORPHANS: sp->flags &= ~SP_USED_BY_ORPHANS; rv = 0; break; default: rv = -EINVAL; } pthread_mutex_unlock(&sp->mutex); out: return rv; } int lockspace_begin_rindex_op(char *space_name, int rindex_op, struct space_info *spi) { struct space *sp; int rv = 0; pthread_mutex_lock(&spaces_mutex); sp = _search_space(space_name, NULL, 0, &spaces, NULL, NULL, NULL); if (!sp) { rv = -ENOENT; goto out; } /* space_dead and thread_stop are only set while spaces_mutex is held, so we don't need to lock sp->mutex */ if (sp->space_dead || sp->thread_stop) { rv = -ENOSPC; goto out; } if (sp->rindex_op) { log_debug("being_rindex_op busy with %d", sp->rindex_op); rv = 
-EBUSY; goto out; } sp->rindex_op = rindex_op; _set_space_info(sp, spi); out: pthread_mutex_unlock(&spaces_mutex); return rv; } int lockspace_clear_rindex_op(char *space_name) { struct space *sp; int rv = 0; pthread_mutex_lock(&spaces_mutex); sp = _search_space(space_name, NULL, 0, &spaces, NULL, NULL, NULL); if (!sp) rv = -ENOENT; else sp->rindex_op = 0; pthread_mutex_unlock(&spaces_mutex); return rv; } static int _clean_event_fds(struct space *sp) { uint32_t end; int count = 0; int old_fd; int i, rv; pthread_mutex_lock(&sp->mutex); for (i = 0; i < MAX_EVENT_FDS; i++) { if (sp->event_fds[i] == -1) continue; old_fd = sp->event_fds[i]; rv = recv(old_fd, &end, sizeof(end), MSG_DONTWAIT); if (rv == -1 && errno == EAGAIN) continue; if ((rv == sizeof(end)) && (end != 1)) { log_erros(sp, "clean_event_fds ignore end value %u on event fd %d", end, old_fd); continue; } if ((rv == sizeof(end)) || !rv || (rv < 0)) log_space(sp, "clean_event_fds unregister event fd %d recv %d", old_fd, rv); else log_erros(sp, "clean_event_fds close event fd %d recv %d", old_fd, rv); close(old_fd); sp->event_fds[i] = -1; count++; } pthread_mutex_unlock(&sp->mutex); return count; } /* * end_event/reg_event don't need to worry about sp being freed * beause the main daemon thread processes end_event/reg_event, * and the main thread is also the only thread that will free * sp structs. 
*/ int lockspace_end_event(struct sanlk_lockspace *ls) { struct space *sp; if (!ls->name[0]) return -EINVAL; pthread_mutex_lock(&spaces_mutex); sp = _search_space(ls->name, NULL, 0, &spaces, NULL, NULL, NULL); pthread_mutex_unlock(&spaces_mutex); if (!sp) return -ENOENT; _clean_event_fds(sp); return 0; } int lockspace_reg_event(struct sanlk_lockspace *ls, int fd, GNUC_UNUSED uint32_t flags) { struct space *sp; int retried = 0; int cleaned = 0; int new_fd = -1; int i; if (!ls->name[0]) return -EINVAL; pthread_mutex_lock(&spaces_mutex); sp = _search_space(ls->name, NULL, 0, &spaces, NULL, NULL, NULL); if (!sp) { pthread_mutex_unlock(&spaces_mutex); log_error("lockspace_reg_event %.48s not found", ls->name); return -ENOENT; } pthread_mutex_unlock(&spaces_mutex); retry: pthread_mutex_lock(&sp->mutex); for (i = 0; i < MAX_EVENT_FDS; i++) { if (sp->event_fds[i] != -1) continue; /* _client_free closes the fd when the reg_event call is done, so we dup it here instead of adding a special case in _client free to keep it open. 
*/ new_fd = dup(fd); sp->event_fds[i] = new_fd; break; } pthread_mutex_unlock(&sp->mutex); log_space(sp, "lockspace_reg_event new_fd %d from client fd %d", new_fd, fd); if (new_fd < 0) { if (retried) return -ENOCSI; cleaned = _clean_event_fds(sp); if (!cleaned) return -ENOCSI; retried = 1; goto retry; } return 0; } int lockspace_set_event(struct sanlk_lockspace *ls, struct sanlk_host_event *he, uint32_t flags) { struct space *sp; struct host_status *hs; uint64_t now; int i, rv = 0; if (!ls->name[0] || !he->host_id || he->host_id > DEFAULT_MAX_HOSTS) { log_error("set_event invalid args host_id %llu name %.48s", (unsigned long long)he->host_id, ls->name); return -EINVAL; } pthread_mutex_lock(&spaces_mutex); sp = _search_space(ls->name, NULL, 0, &spaces, NULL, NULL, NULL); if (!sp) { pthread_mutex_unlock(&spaces_mutex); return -ENOENT; } pthread_mutex_unlock(&spaces_mutex); if (he->host_id > sp->max_hosts) { log_error("set_event host_id %llu too large max %u %s", (unsigned long long)he->host_id, sp->max_hosts, ls->name); return -EINVAL; } if (!he->generation && (flags & SANLK_SETEV_CUR_GENERATION)) { hs = &(sp->host_status[he->host_id-1]); he->generation = hs->owner_generation; } now = monotime(); pthread_mutex_lock(&sp->mutex); if (flags & SANLK_SETEV_CLEAR_EVENT) { memset(&sp->host_event, 0, sizeof(struct sanlk_host_event)); sp->set_event_time = now; goto out; } if (flags & SANLK_SETEV_CLEAR_HOSTID) { sp->host_status[he->host_id-1].set_bit_time = 0; goto out; } if (flags & SANLK_SETEV_REPLACE_EVENT) goto set; /* log a warning if one non-zero event clobbers another non-zero event */ if ((now - sp->set_event_time < sp->set_bitmap_seconds) && sp->host_event.event && he->event && (sp->host_event.event != he->event)) { log_warns(sp, "event %llu %llu %llu %llu replaced by %llu %llu %llu %llu t %llu", (unsigned long long)sp->host_event.host_id, (unsigned long long)sp->host_event.generation, (unsigned long long)sp->host_event.event, (unsigned long long)sp->host_event.data, 
(unsigned long long)he->host_id, (unsigned long long)he->generation, (unsigned long long)he->event, (unsigned long long)he->data, (unsigned long long)sp->set_event_time); rv = -EBUSY; goto out; } set: sp->set_event_time = now; sp->host_status[he->host_id-1].set_bit_time = now; memcpy(&sp->host_event, he, sizeof(struct sanlk_host_event)); if (flags & SANLK_SETEV_ALL_HOSTS) { for (i = 0; i < sp->max_hosts; i++) sp->host_status[i].set_bit_time = now; } out: pthread_mutex_unlock(&sp->mutex); return rv; } int send_event_callbacks(uint32_t space_id, uint64_t from_host_id, uint64_t from_generation, struct sanlk_host_event *he) { struct space *sp; struct event_cb cb; int fd, i; int rv = 0; memset(&cb, 0, sizeof(cb)); cb.h.magic = SM_MAGIC; cb.h.version = SM_CB_PROTO; cb.h.cmd = SM_CB_GET_EVENT; cb.h.length = sizeof(cb); memcpy(&cb.he, he, sizeof(struct sanlk_host_event)); cb.from_host_id = from_host_id; cb.from_generation = from_generation; pthread_mutex_lock(&spaces_mutex); sp = find_lockspace_id(space_id); if (!sp) { pthread_mutex_unlock(&spaces_mutex); rv = -ENOSPC; goto ret; } pthread_mutex_unlock(&spaces_mutex); pthread_mutex_lock(&sp->mutex); for (i = 0; i < MAX_EVENT_FDS; i++) { if (sp->event_fds[i] == -1) continue; fd = sp->event_fds[i]; rv = send(fd, &cb, sizeof(cb), MSG_NOSIGNAL | MSG_DONTWAIT); if (rv < 0) { log_erros(sp, "send_event_callbacks error %d %d close fd %d", rv, errno, fd); close(fd); sp->event_fds[i] = -1; } log_space(sp, "sent event to fd %d", fd); } pthread_mutex_unlock(&sp->mutex); ret: return rv; } /* * we call stop_host_id() when all pids are gone and we're in a safe state, so * it's safe to unlink the watchdog right away here. We want to sp the unlink * as soon as it's safe, so we can reduce the chance we get killed by the * watchdog (we could actually call this in main_loop just before the break). * Getting this unlink done quickly is more important than doing at the more * "logical" point commented above in host_id_thread. 
*/ static int stop_lockspace_thread(struct space *sp, int wait) { int stop, rv; pthread_mutex_lock(&sp->mutex); stop = sp->thread_stop; sp->thread_stop = 1; pthread_mutex_unlock(&sp->mutex); if (!stop) { /* should never happen */ log_erros(sp, "stop_lockspace_thread zero thread_stop"); return -EINVAL; } if (wait) rv = pthread_join(sp->thread, NULL); else rv = pthread_tryjoin_np(sp->thread, NULL); return rv; } /* * locking/lifetime rules for a struct space * * multiple factors: * . spaces_mutex * . sp->mutex * . the specific thread: main daemon thread, lockspace thread, worker thread * * spaces, spaces_add, spaces_rem lists are protected by spaces_mutex * * sp->mutex protects info that is exchanged between the lockspace thread * (for the sp) and the main thread. This is primarily sp->thread_stop, * and sp->lease_status (although it seems a couple other bits of info * have been added over time that are communicated between the lockspace * thread and the main thread). * * add_lockspace_start(), called by worker thread, creates sp, * adds it to spaces_add list under spaces_mutex, creates lockspace_thread * for the sp. * * lockspace_thread never has to worry about sp going away and can access * sp directly any time. The sp will not be freed until lockspace_thread * has exited. * * The main thread never has to worry about sp going away, because the * main thread is the only context in which sp structs are freed * (and that only happens in free_lockspaces). * * add_lockspace_wait(), called by worker thread, can access sp directly * because sp won't go away while it's on spaces_add. Only add_lockspace_wait * can do something with sp while it's on spaces_add. _wait uses sp->mutex * to exchange lease status with lockspace_thread. Once the host_id lease * is acquired, _wait moves sp from spaces_add to spaces under spaces_mutex. * After sp is moved to spaces list, its lifetime is owned by the main thread. 
* * While sp is on spaces list, its lifetime is controlled by the main thread. * Apart from lockspace_thread, any other thread, e.g. worker thread, must * lock spaces_mutex, look up sp on spaces list, access sp fields, then unlock * spaces_mutex. After releasing spaces_mutex, it can't access sp struct * because the main thread could dispose of it. If the worker thread wants * to look at info that's being updated by the lockspace_thread, it should * also take sp->mutex before copying it. * * I currently see some violations of proper sp access that should be fixed. * The bad pattern in each case is: lock spaces_mutex, find sp, * unlock spaces_mutex, lock sp->mutex. The sp could in theory go away between * unlock spaces_mutex and lock sp->mutex. (In practice this would * likely never happen.) * * . worker_thread lockspace_set_event() * (reg_event and end_event are ok since they are called from * the main thread) * * . worker_thread host_status_set_bit() * * . resource_thread send_event_callbacks() does the same. * * I'm not sure what the best solution would be: lock sp->mutex before * unlocking spaces_mutex? Do everything under spaces_mutex? Add * a simple ref count to sp for these cases of using sp from other * threads? * * cmd_rem_lockspace() is run by a worker_thread. rem_lockspace_start() * locks spaces_mutex, finds sp, sets sp->external_remove, unlocks spaces_mutex. * Then the main thread, which owns the sp structs, sees sp->external_remove, * kills any pids using the sp, and when the sp is no longer used, it sets * sp->thread_stop, and moves sp from spaces list to spaces_rem list. * The main thread then runs free_lockspaces() which stops the lockspace_thread * for sp's on spaces_rem. When the lockspace_thread exits, the main thread * then removes sp from spaces_rem and frees sp. 
/*
 * Copyright 2010-2011 Red Hat, Inc.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v2 or (at your option) any later version.
 */

/*
 * Lockspace interface.  The comment above each prototype names the
 * mutexes that the function takes itself.
 */

#ifndef __LOCKSPACE_H__
#define __LOCKSPACE_H__

#include <stdint.h>

/* only pointers to these are used here, so forward declarations suffice */
struct space;
struct space_info;
struct sync_disk;
struct host_status;
struct sanlk_lockspace;
struct sanlk_host_event;

/* See resource.h for lock ordering between spaces_mutex and resource_mutex. */

/* no locks */
struct space *find_lockspace(const char *name);

/* no locks */
int _lockspace_info(const char *space_name, struct space_info *spi);

/* locks spaces_mutex */
int lockspace_info(const char *space_name, struct space_info *spi);

/* locks spaces_mutex */
int lockspace_disk(char *space_name, struct sync_disk *disk, int *sector_size);

/* locks spaces_mutex */
int host_info(char *space_name, uint64_t host_id, struct host_status *hs_out);

/* locks spaces_mutex, locks sp */
int host_status_set_bit(char *space_name, uint64_t host_id);

/* no locks */
int test_id_bit(int host_id, char *bitmap);

/* no locks */
void set_id_bit(int host_id, char *bitmap, char *c);

/* locks sp */
int check_our_lease(struct space *sp, int *check_all, char *check_buf);

/* locks resource_mutex (add_host_event),
   locks resource_mutex (set_resource_examine) */
void check_other_leases(struct space *sp, char *buf);

/* locks spaces_mutex */
int add_lockspace_start(struct sanlk_lockspace *ls, uint32_t io_timeout, struct space **sp_out);

/* locks sp, locks spaces_mutex */
int add_lockspace_wait(struct space *sp);

/* locks spaces_mutex */
int inq_lockspace(struct sanlk_lockspace *ls);

/* locks spaces_mutex */
int rem_lockspace_start(struct sanlk_lockspace *ls, unsigned int *space_id);

/* locks spaces_mutex */
int rem_lockspace_wait(struct sanlk_lockspace *ls, unsigned int space_id);

/* locks spaces_mutex, locks sp */
void free_lockspaces(int wait);

/* locks spaces_mutex */
int get_lockspaces(char *buf, int *len, int *count, int maxlen);

/* locks spaces_mutex */
int get_hosts(struct sanlk_lockspace *ls, char *buf, int *len, int *count, int maxlen);

/* locks spaces_mutex, locks sp */
int lockspace_set_event(struct sanlk_lockspace *ls, struct sanlk_host_event *he, uint32_t flags);

/* locks spaces_mutex, locks sp */
int lockspace_reg_event(struct sanlk_lockspace *ls, int fd, uint32_t flags);

/* locks spaces_mutex, locks sp */
int lockspace_end_event(struct sanlk_lockspace *ls);

/* locks spaces_mutex, locks sp */
int send_event_callbacks(uint32_t space_id, uint64_t from_host_id, uint64_t from_generation, struct sanlk_host_event *he);

/* locks spaces_mutex, locks sp */
int lockspace_set_config(struct sanlk_lockspace *ls, uint32_t flags, uint32_t cmd);

int lockspace_begin_rindex_op(char *space_name, int rindex_op, struct space_info *spi);

int lockspace_clear_rindex_op(char *space_name);

#endif
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "log.h" #define LOG_STR_LEN 512 static char log_str[LOG_STR_LEN]; static pthread_t thread_handle; static pthread_mutex_t log_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t log_cond = PTHREAD_COND_INITIALIZER; static char log_dump[LOG_DUMP_SIZE]; static unsigned int log_point; static unsigned int log_wrap; struct entry { int level; char str[LOG_STR_LEN]; }; #define LOG_DEFAULT_ENTRIES 4096 static struct entry *log_ents; static unsigned int log_num_ents = LOG_DEFAULT_ENTRIES; static unsigned int log_head_ent; /* add at head */ static unsigned int log_tail_ent; /* remove from tail */ static unsigned int log_dropped; static unsigned int log_pending_ents; static unsigned int log_thread_done; static char logfile_path[PATH_MAX]; static FILE *logfile_fp; extern int log_logfile_priority; extern int log_logfile_use_utc; extern int log_syslog_priority; extern int log_stderr_priority; static void _log_save_dump(int level GNUC_UNUSED, int len) { int i; if (len < LOG_DUMP_SIZE - log_point) { memcpy(log_dump+log_point, log_str, len); log_point += len; if (log_point == LOG_DUMP_SIZE) { log_point = 0; log_wrap = 1; } return; } for (i = 0; i < len; i++) { log_dump[log_point++] = log_str[i]; if (log_point == LOG_DUMP_SIZE) { log_point = 0; log_wrap = 1; } } } static void _log_save_ent(int level, int len) { struct entry *e; if (!log_ents) return; if (log_pending_ents == log_num_ents) { log_dropped++; return; } e = &log_ents[log_head_ent++]; log_head_ent = log_head_ent % log_num_ents; log_pending_ents++; e->level = level; memcpy(e->str, log_str, len); } /* * This log function: * 1. formats the log message in the log_str buffer * 2. copies log_str into the log_dump circular buffer * 3. 
copies log_str into the log_ents circular array to be written to * logfile and/or syslog (so callers don't block writing messages to files) * * N.B. level as "int" instead of "uint32_t" is needed because * of comparison with int log_stderr_priority which can be -1. */ void log_level(uint32_t space_id, uint32_t res_id, char *name_in, int level, const char *fmt, ...) { va_list ap; char name[NAME_ID_SIZE + 1]; int ret, pos = 0; int len = LOG_STR_LEN - 2; /* leave room for \n\0 */ struct timeval cur_time; struct tm time_info; pid_t tid; memset(name, 0, sizeof(name)); if (level == LOG_CLIENT) { int log_ci = 0, log_fd = 0; if (!com.debug_clients) return; level = LOG_DEBUG; log_ci = space_id; log_fd = res_id; if (!name_in) snprintf(name, NAME_ID_SIZE, "cl %d:%d ", log_ci, log_fd); else snprintf(name, NAME_ID_SIZE, "cl %d:%d %.8s ", log_ci, log_fd, name_in); } else if (level == LOG_CMD) { uint32_t cmd = space_id; if (!is_cmd_debug(cmd)) return; space_id = 0; level = LOG_DEBUG; } else { if (space_id && !res_id) snprintf(name, NAME_ID_SIZE, "s%u ", space_id); else if (!space_id && res_id) snprintf(name, NAME_ID_SIZE, "r%u ", res_id); else if (space_id && res_id) snprintf(name, NAME_ID_SIZE, "s%u:r%u ", space_id, res_id); else if (name_in) snprintf(name, NAME_ID_SIZE, "%.8s ", name_in); } pthread_mutex_lock(&log_mutex); gettimeofday(&cur_time, NULL); if (log_logfile_use_utc) gmtime_r(&cur_time.tv_sec, &time_info); else localtime_r(&cur_time.tv_sec, &time_info); ret = strftime(log_str + pos, len - pos, "%Y-%m-%d %H:%M:%S ", &time_info); pos += ret; tid = syscall(SYS_gettid); ret = snprintf(log_str + pos, len - pos, "%llu [%u]: %s", (unsigned long long) monotime(), tid, name); pos += ret; va_start(ap, fmt); ret = vsnprintf(log_str + pos, len - pos, fmt, ap); va_end(ap); if (ret >= len - pos) pos = len - 1; else pos += ret; log_str[pos++] = '\n'; log_str[pos++] = '\0'; /* * save all messages in circular buffer "log_dump" that can be * sent over unix socket */ 
_log_save_dump(level, pos - 1); /* * save some messages in circular array "log_ents" that a thread * writes to logfile/syslog */ if (level <= log_logfile_priority || level <= log_syslog_priority) _log_save_ent(level, pos); if (level <= log_stderr_priority) fprintf(stderr, "%s", log_str); pthread_cond_signal(&log_cond); pthread_mutex_unlock(&log_mutex); } static void write_entry(int level, char *str) { if ((level <= log_logfile_priority) && logfile_fp) { fprintf(logfile_fp, "%s", str); fflush(logfile_fp); } if (level <= log_syslog_priority) syslog(level, "%s", str); } static void write_dropped(int level, int num) { char str[LOG_STR_LEN]; sprintf(str, "dropped %d entries", num); write_entry(level, str); } void copy_log_dump(char *buf, int *len) { int tail_len; pthread_mutex_lock(&log_mutex); if (!log_wrap && !log_point) { *len = 0; } else if (log_wrap) { tail_len = LOG_DUMP_SIZE - log_point; memcpy(buf, log_dump+log_point, tail_len); if (log_point) memcpy(buf+tail_len, log_dump, log_point); *len = LOG_DUMP_SIZE; } else { memcpy(buf, log_dump, log_point-1); *len = log_point-1; } pthread_mutex_unlock(&log_mutex); } static void *log_thread_fn(void *arg GNUC_UNUSED) { char str[LOG_STR_LEN]; struct entry *e; int level, prev_dropped = 0; while (1) { pthread_mutex_lock(&log_mutex); while (log_head_ent == log_tail_ent) { if (log_thread_done) { pthread_mutex_unlock(&log_mutex); goto out; } pthread_cond_wait(&log_cond, &log_mutex); } e = &log_ents[log_tail_ent++]; log_tail_ent = log_tail_ent % log_num_ents; log_pending_ents--; memcpy(str, e->str, LOG_STR_LEN); level = e->level; prev_dropped = log_dropped; log_dropped = 0; pthread_mutex_unlock(&log_mutex); if (prev_dropped) { write_dropped(level, prev_dropped); prev_dropped = 0; } write_entry(level, str); } out: pthread_exit(NULL); } int setup_logging(void) { int fd, rv; snprintf(logfile_path, PATH_MAX, "%s/%s", SANLK_LOG_DIR, SANLK_LOGFILE_NAME); logfile_fp = fopen(logfile_path, "a+"); if (logfile_fp) { fd = 
fileno(logfile_fp); fcntl(fd, F_SETFD, fcntl(fd, F_GETFD, 0) | FD_CLOEXEC); } log_ents = malloc(log_num_ents * sizeof(struct entry)); if (!log_ents) { fclose(logfile_fp); logfile_fp = NULL; return -1; } memset(log_ents, 0, log_num_ents * sizeof(struct entry)); openlog(DAEMON_NAME, LOG_CONS | LOG_PID, LOG_DAEMON); rv = pthread_create(&thread_handle, NULL, log_thread_fn, NULL); if (rv) return -1; return 0; } void close_logging(void) { pthread_mutex_lock(&log_mutex); log_thread_done = 1; pthread_cond_signal(&log_cond); pthread_mutex_unlock(&log_mutex); pthread_join(thread_handle, NULL); pthread_mutex_lock(&log_mutex); closelog(); if (logfile_fp) { fclose(logfile_fp); logfile_fp = NULL; } pthread_mutex_unlock(&log_mutex); } sanlock-3.8.2/src/log.h000066400000000000000000000056421371427612200147130ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __LOG_H__ #define __LOG_H__ #define LOG_CLIENT LOG_LOCAL0 #define LOG_CMD LOG_LOCAL1 /* * Log levels are used mainly to indicate where the message * should be recorded: * * log_error() write to /var/log/messages and /var/log/sanlock.log * log_level(WARNING) write to /var/log/sanlock.log * log_debug() write to incore buffer, not to file * * Anything in /var/log/messages should not happen and should be reported. * So anything we want to visible and reported should be LOG_ERR. * * If we want to log something to assist in debugging, but not be reported, * it should be LOG_WARNING (goes only to sanlock.log) */ void log_level(uint32_t space_id, uint32_t res_id, char *name_in, int level, const char *fmt, ...) __attribute__((format(printf, 5, 6))); int setup_logging(void); void close_logging(void); void copy_log_dump(char *buf, int *len); #define log_debug(fmt, args...) 
/*
 * Convenience wrappers around log_level().  The first two arguments of
 * log_level() are space_id and res_id, the third is an optional name.
 *
 *   log_debug/warn/error   no prefix
 *   log_space/warns/erros  prefix from space->space_id
 *   log_token/warnt/errot  prefix from token->space_id and token->res_id
 *   log_sid                prefix from a bare space id
 *   log_rid                prefix from a bare resource id
 *   log_taske/w/d          prefix from task->name
 *   log_client             client index and fd, level LOG_CLIENT
 *   log_cmd                command number, level LOG_CMD
 *
 * Pointer arguments are parenthesized so that any expression can be passed.
 */
#define log_debug(fmt, args...)         log_level(0, 0, NULL, LOG_DEBUG, fmt, ##args)
#define log_space(space, fmt, args...)  log_level((space)->space_id, 0, NULL, LOG_DEBUG, fmt, ##args)
#define log_token(token, fmt, args...)  log_level((token)->space_id, (token)->res_id, NULL, LOG_DEBUG, fmt, ##args)
#define log_sid(space_id, fmt, args...) log_level(space_id, 0, NULL, LOG_DEBUG, fmt, ##args)
/* the resource id goes in the second (res_id) slot, not the space_id slot */
#define log_rid(res_id, fmt, args...)   log_level(0, res_id, NULL, LOG_DEBUG, fmt, ##args)

#define log_warn(fmt, args...)          log_level(0, 0, NULL, LOG_WARNING, fmt, ##args)
#define log_warns(space, fmt, args...)  log_level((space)->space_id, 0, NULL, LOG_WARNING, fmt, ##args)
#define log_warnt(token, fmt, args...)  log_level((token)->space_id, (token)->res_id, NULL, LOG_WARNING, fmt, ##args)

#define log_error(fmt, args...)         log_level(0, 0, NULL, LOG_ERR, fmt, ##args)
#define log_erros(space, fmt, args...)  log_level((space)->space_id, 0, NULL, LOG_ERR, fmt, ##args)
#define log_errot(token, fmt, args...)  log_level((token)->space_id, (token)->res_id, NULL, LOG_ERR, fmt, ##args)

#define log_taske(task, fmt, args...)   log_level(0, 0, (task)->name, LOG_ERR, fmt, ##args)
#define log_taskw(task, fmt, args...)   log_level(0, 0, (task)->name, LOG_WARNING, fmt, ##args)
#define log_taskd(task, fmt, args...)   log_level(0, 0, (task)->name, LOG_DEBUG, fmt, ##args)

#define log_client(ci, fd, fmt, args...) log_level(ci, fd, NULL, LOG_CLIENT, fmt, ##args)
#define log_cmd(cmd, fmt, args...)       log_level(cmd, 0, NULL, LOG_CMD, fmt, ##args)

/* use log_tool for tool actions (non-daemon), and for daemon until
   logging is set up */

#define log_tool(fmt, args...) \
do { \
	printf(fmt "\n", ##args); \
} while (0)
static char **cmd_argv; static struct thread_pool pool; static char rand_state[32]; static pthread_mutex_t rand_mutex = PTHREAD_MUTEX_INITIALIZER; static const char *run_dir = NULL; static int privileged = 1; static void close_helper(void) { close(helper_kill_fd); close(helper_status_fd); helper_kill_fd = -1; helper_status_fd = -1; pollfd[helper_ci].fd = -1; pollfd[helper_ci].events = 0; helper_ci = -1; /* don't set helper_pid = -1 until we've tried waitpid */ } /* * We cannot block the main thread on this write, so the pipe is * NONBLOCK, and write fails with EAGAIN when the pipe is full. * With 512 msg size and 64k default pipe size, the pipe will be full * if we quickly send kill messages for 128 pids. We retry * the kill once a second, so we'll retry the write again in * a second. * * By setting the pipe size to 1MB in setup_helper, we could quickly send 2048 * msgs before getting EAGAIN. */ static void send_helper_kill(struct space *sp, struct client *cl, int sig) { struct helper_msg hm; int rv; /* * We come through here once a second while the pid still has * leases. We only send a single RUNPATH message, so after * the first RUNPATH goes through we set CL_RUNPATH_SENT to * avoid futher RUNPATH's. 
*/ if ((cl->flags & CL_RUNPATH_SENT) && (sig == SIGRUNPATH)) return; if (helper_kill_fd == -1) { log_error("send_helper_kill pid %d no fd", cl->pid); return; } memset(&hm, 0, sizeof(hm)); if (sig == SIGRUNPATH) { hm.type = HELPER_MSG_RUNPATH; memcpy(hm.path, cl->killpath, SANLK_HELPER_PATH_LEN); memcpy(hm.args, cl->killargs, SANLK_HELPER_ARGS_LEN); /* only include pid if it's requested as a killpath arg */ if (cl->flags & CL_KILLPATH_PID) hm.pid = cl->pid; } else { hm.type = HELPER_MSG_KILLPID; hm.sig = sig; hm.pid = cl->pid; } log_erros(sp, "kill %d sig %d count %d", cl->pid, sig, cl->kill_count); retry: rv = write(helper_kill_fd, &hm, sizeof(hm)); if (rv == -1 && errno == EINTR) goto retry; /* pipe is full, we'll try again in a second */ if (rv == -1 && errno == EAGAIN) { helper_full_count++; log_space(sp, "send_helper_kill pid %d sig %d full_count %u", cl->pid, sig, helper_full_count); return; } /* helper exited or closed fd, quit using helper */ if (rv == -1 && errno == EPIPE) { log_erros(sp, "send_helper_kill EPIPE"); close_helper(); return; } if (rv != sizeof(hm)) { /* this shouldn't happen */ log_erros(sp, "send_helper_kill pid %d error %d %d", cl->pid, rv, errno); close_helper(); return; } if (sig == SIGRUNPATH) cl->flags |= CL_RUNPATH_SENT; } /* FIXME: add a mutex for client array so we don't try to expand it while a cmd thread is using it. Or, with a thread pool we know when cmd threads are running and can expand when none are. 
*/ static int client_alloc(void) { int i; /* pollfd is one element longer as we use an additional element for the * eventfd notification mechanism */ client = malloc(CLIENT_NALLOC * sizeof(struct client)); pollfd = malloc((CLIENT_NALLOC+1) * sizeof(struct pollfd)); if (!client || !pollfd) { log_error("can't alloc for client or pollfd array"); return -ENOMEM; } for (i = 0; i < CLIENT_NALLOC; i++) { memset(&client[i], 0, sizeof(struct client)); memset(&pollfd[i], 0, sizeof(struct pollfd)); pthread_mutex_init(&client[i].mutex, NULL); client[i].fd = -1; client[i].pid = -1; pollfd[i].fd = -1; pollfd[i].events = 0; } client_size = CLIENT_NALLOC; return 0; } static void _client_free(int ci) { struct client *cl = &client[ci]; if (cl->cmd_active || cl->pid_dead) log_client(ci, cl->fd, "free cmd %d dead %d", cl->cmd_active, cl->pid_dead); else log_client(ci, cl->fd, "free"); if (!cl->used) { /* should never happen */ log_error("client_free ci %d not used", ci); goto out; } if (cl->pid != -1) { /* client_pid_dead() should have set pid to -1 */ /* should never happen */ log_error("client_free ci %d live pid %d", ci, cl->pid); goto out; } if (cl->fd == -1) { /* should never happen */ log_error("client_free ci %d is free", ci); goto out; } if (cl->need_free) log_debug("client_free ci %d already need_free", ci); if (cl->suspend) { log_debug("client_free ci %d is suspended", ci); cl->need_free = 1; goto out; } if (cl->fd != -1) close(cl->fd); cl->used = 0; cl->fd = -1; cl->pid = -1; cl->cmd_active = 0; cl->pid_dead = 0; cl->suspend = 0; cl->need_free = 0; cl->kill_count = 0; cl->kill_last = 0; cl->restricted = 0; cl->flags = 0; memset(cl->owner_name, 0, sizeof(cl->owner_name)); memset(cl->killpath, 0, SANLK_HELPER_PATH_LEN); memset(cl->killargs, 0, SANLK_HELPER_ARGS_LEN); cl->workfn = NULL; cl->deadfn = NULL; if (cl->tokens) free(cl->tokens); cl->tokens = NULL; cl->tokens_slots = 0; /* make poll() ignore this connection */ pollfd[ci].fd = -1; pollfd[ci].events = 0; 
pollfd[ci].revents = 0; out: return; } void client_free(int ci); void client_free(int ci) { struct client *cl = &client[ci]; pthread_mutex_lock(&cl->mutex); _client_free(ci); pthread_mutex_unlock(&cl->mutex); } /* the connection that we suspend and resume may or may not be the same connection as the target client where we set cmd_active */ static int client_suspend(int ci) { struct client *cl = &client[ci]; int rv = 0; pthread_mutex_lock(&cl->mutex); if (!cl->used) { /* should never happen */ log_error("client_suspend ci %d not used", ci); rv = -1; goto out; } if (cl->fd == -1) { /* should never happen */ log_error("client_suspend ci %d is free", ci); rv = -1; goto out; } if (cl->suspend) { /* should never happen */ log_error("client_suspend ci %d is suspended", ci); rv = -1; goto out; } log_client(ci, cl->fd, "suspend"); cl->suspend = 1; /* make poll() ignore this connection */ pollfd[ci].fd = -1; pollfd[ci].events = 0; out: pthread_mutex_unlock(&cl->mutex); return rv; } void client_resume(int ci); void client_resume(int ci) { struct client *cl = &client[ci]; pthread_mutex_lock(&cl->mutex); if (!cl->used) { /* should never happen */ log_error("client_resume ci %d not used", ci); goto out; } if (cl->fd == -1) { /* should never happen */ log_error("client_resume ci %d is free", ci); goto out; } if (!cl->suspend) { /* should never happen */ log_error("client_resume ci %d not suspended", ci); goto out; } log_client(ci, cl->fd, "resume"); cl->suspend = 0; if (cl->need_free) { log_debug("client_resume ci %d need_free", ci); _client_free(ci); } else { /* make poll() watch this connection */ pollfd[ci].fd = cl->fd; pollfd[ci].events = POLLIN; /* interrupt any poll() that might already be running */ eventfd_write(efd, 1); } out: pthread_mutex_unlock(&cl->mutex); } static int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci)) { struct client *cl; int i; for (i = 0; i < client_size; i++) { cl = &client[i]; pthread_mutex_lock(&cl->mutex); if (!cl->used) { 
cl->used = 1; cl->fd = fd; cl->workfn = workfn; cl->deadfn = deadfn ? deadfn : client_free; /* make poll() watch this connection */ pollfd[i].fd = fd; pollfd[i].events = POLLIN; if (i > client_maxi) client_maxi = i; pthread_mutex_unlock(&cl->mutex); log_client(i, fd, "add"); return i; } pthread_mutex_unlock(&cl->mutex); } return -1; } /* clear the unreceived portion of an aborted command */ void client_recv_all(int ci, struct sm_header *h_recv, int pos); void client_recv_all(int ci, struct sm_header *h_recv, int pos) { char trash[64]; int rem = h_recv->length - sizeof(struct sm_header) - pos; int rv, error = 0, total = 0, retries = 0; if (!rem) return; while (1) { rv = recv(client[ci].fd, trash, sizeof(trash), MSG_DONTWAIT); if (rv == -1 && errno == EAGAIN) { usleep(1000); if (retries < 20) { retries++; continue; } } if (rv == -1) error = errno; if (rv <= 0) break; total += rv; if (total >= rem) break; } log_debug("client recv_all %d,%d,%d pos %d rv %d error %d retries %d rem %d total %d", ci, client[ci].fd, client[ci].pid, pos, rv, error, retries, rem, total); } void send_result(int ci, int fd, struct sm_header *h_recv, int result); void send_result(int ci, int fd, struct sm_header *h_recv, int result) { struct sm_header h; log_client(ci, fd, "send %d", result); memcpy(&h, h_recv, sizeof(struct sm_header)); h.version = SM_PROTO; h.length = sizeof(h); h.data = result; h.data2 = 0; send(fd, &h, sizeof(h), MSG_NOSIGNAL); } void client_pid_dead(int ci); void client_pid_dead(int ci) { struct client *cl = &client[ci]; int cmd_active; int i, pid; /* cmd_acquire_thread may still be waiting for the tokens to be acquired. if it is, cl->pid_dead tells it to release them when finished. 
Similarly, cmd_release_thread, cmd_inquire_thread are accessing cl->tokens */ pthread_mutex_lock(&cl->mutex); if (!cl->used || cl->fd == -1 || cl->pid == -1) { /* should never happen */ pthread_mutex_unlock(&cl->mutex); log_error("client_pid_dead %d,%d,%d u %d a %d s %d bad state", ci, cl->fd, cl->pid, cl->used, cl->cmd_active, cl->suspend); return; } log_debug("client_pid_dead %d,%d,%d cmd_active %d suspend %d", ci, cl->fd, cl->pid, cl->cmd_active, cl->suspend); if (cl->kill_count) log_error("dead %d ci %d count %d", cl->pid, ci, cl->kill_count); cmd_active = cl->cmd_active; pid = cl->pid; cl->pid = -1; cl->pid_dead = 1; /* when cmd_active is set and cmd_a,r,i_thread is done and takes cl->mutex to set cl->cmd_active to 0, it will see cl->pid_dead is 1 and know they need to release cl->tokens and call client_free */ /* make poll() ignore this connection */ pollfd[ci].fd = -1; pollfd[ci].events = 0; pthread_mutex_unlock(&cl->mutex); /* it would be nice to do this SIGKILL as a confirmation that the pid is really gone (i.e. didn't just close the fd) if we always had root permission to do it */ /* kill(pid, SIGKILL); */ if (cmd_active) { log_debug("client_pid_dead %d,%d,%d defer to cmd %d", ci, cl->fd, pid, cmd_active); return; } /* use async release here because this is the main thread that we don't want to block doing disk lease i/o */ pthread_mutex_lock(&cl->mutex); for (i = 0; i < cl->tokens_slots; i++) { if (cl->tokens[i]) { release_token_async(cl->tokens[i]); free(cl->tokens[i]); } } _client_free(ci); pthread_mutex_unlock(&cl->mutex); } /* At some point we may want to keep a record of each pid using a lockspace in the sp struct to avoid walking through each client's cl->tokens to see if it's using the lockspace. It should be the uncommon situation where a lockspace renewal fails and we need to walk through all client tokens like this. i.e. 
we'd probably not want to optimize for this case at the expense of the more common case where a pid exits, but we do want it to be robust. The locking is also made a bit ugly by these three routines that need to correlate which clients are using which lockspaces. (client_using_space, kill_pids, all_pids_dead) spaces_mutex is held when they are called, and they need to take cl->mutex. This means that cmd_acquire_thread has to lock both spaces_mutex and cl->mutex when adding new tokens to the client. (It needs to check that the lockspace for the new tokens hasn't failed while the tokens were being acquired.) In kill_pids and all_pids_dead could we check cl->pid <= 0 without taking cl->mutex, since client_pid_dead in the main thread is the only place that changes that? */ static int client_using_space(struct client *cl, struct space *sp) { struct token *token; int i, rv = 0; for (i = 0; i < cl->tokens_slots; i++) { token = cl->tokens[i]; if (!token) continue; if (strncmp(token->r.lockspace_name, sp->space_name, NAME_ID_SIZE)) continue; if (!cl->kill_count) log_token(token, "client_using_space pid %d", cl->pid); if (sp->space_dead) token->space_dead = sp->space_dead; rv = 1; } return rv; } static void kill_pids(struct space *sp) { struct client *cl; uint64_t now, last_success; int id_renewal_fail_seconds; int ci, sig; int do_kill, in_grace; /* * all remaining pids using sp are stuck, we've made max attempts to * kill all, don't bother cycling through them */ if (sp->killing_pids > 1) return; id_renewal_fail_seconds = calc_id_renewal_fail_seconds(sp->io_timeout); /* * If we happen to renew our lease after we've started killing pids, * the period we allow for graceful shutdown will be extended. This * is an incidental effect, although it may be nice. The previous * behavior would still be ok, where we only ever allow up to * kill_grace_seconds for graceful shutdown before moving to sigkill. 
*/ pthread_mutex_lock(&sp->mutex); last_success = sp->lease_status.renewal_last_success; pthread_mutex_unlock(&sp->mutex); now = monotime(); for (ci = 0; ci <= client_maxi; ci++) { do_kill = 0; cl = &client[ci]; pthread_mutex_lock(&cl->mutex); if (!cl->used) goto unlock; if (cl->pid <= 0) goto unlock; /* NB this cl may not be using sp, but trying to avoid the expensive client_using_space check */ if (cl->kill_count >= kill_count_max) goto unlock; if (cl->kill_count && (now - cl->kill_last < 1)) goto unlock; if (!client_using_space(cl, sp)) goto unlock; cl->kill_last = now; cl->kill_count++; /* * the transition from using killpath/sigterm to sigkill * is when now >= * last successful lease renewal + * id_renewal_fail_seconds + * kill_grace_seconds */ in_grace = now < (last_success + id_renewal_fail_seconds + kill_grace_seconds); if (sp->external_remove || (external_shutdown > 1)) { sig = SIGKILL; } else if ((kill_grace_seconds > 0) && in_grace && cl->killpath[0]) { sig = SIGRUNPATH; } else if (in_grace) { sig = SIGTERM; } else { sig = SIGKILL; } /* * sigterm will be used in place of sigkill if restricted * sigkill will be used in place of sigterm if restricted */ if ((sig == SIGKILL) && (cl->restricted & SANLK_RESTRICT_SIGKILL)) sig = SIGTERM; if ((sig == SIGTERM) && (cl->restricted & SANLK_RESTRICT_SIGTERM)) sig = SIGKILL; do_kill = 1; unlock: pthread_mutex_unlock(&cl->mutex); if (!do_kill) continue; send_helper_kill(sp, cl, sig); } } static int all_pids_dead(struct space *sp) { struct client *cl; int stuck = 0, check = 0; int ci; for (ci = 0; ci <= client_maxi; ci++) { cl = &client[ci]; pthread_mutex_lock(&cl->mutex); if (!cl->used) goto unlock; if (cl->pid <= 0) goto unlock; if (!client_using_space(cl, sp)) goto unlock; if (cl->kill_count >= kill_count_max) stuck++; else check++; unlock: pthread_mutex_unlock(&cl->mutex); } if (stuck && !check && sp->killing_pids < 2) { log_erros(sp, "killing pids stuck %d", stuck); /* cause kill_pids to give up */ 
sp->killing_pids = 2; } if (stuck || check) return 0; if (sp->flags & SP_EXTERNAL_USED) { if (!sp->used_retries || !(sp->used_retries % 1000)) log_erros(sp, "used external blocking lockspace removal"); sp->used_retries++; return 0; } if (sp->flags & SP_USED_BY_ORPHANS) { /* * lock ordering: spaces_mutex (main_loop), then * resource_mutex (resource_orphan_count) */ int orphans = resource_orphan_count(sp->space_name); if (orphans) { if (!sp->used_retries || !(sp->used_retries % 1000)) log_erros(sp, "used by orphan %d blocking lockspace removal", orphans); sp->used_retries++; return 0; } } if (sp->renew_fail || sp->used_retries) log_erros(sp, "all pids clear"); else log_space(sp, "all pids clear"); return 1; } static unsigned int time_diff(struct timeval *begin, struct timeval *end) { struct timeval result; timersub(end, begin, &result); return (result.tv_sec * 1000) + (result.tv_usec / 1000); } #define STANDARD_CHECK_INTERVAL 1000 /* milliseconds */ #define RECOVERY_CHECK_INTERVAL 200 /* milliseconds */ static int main_loop(void) { void (*workfn) (int ci); void (*deadfn) (int ci); struct space *sp, *safe; struct timeval now, last_check; int poll_timeout, check_interval; unsigned int ms; int i, rv, empty, check_all; char *check_buf = NULL; int check_buf_len = 0; uint64_t ebuf; gettimeofday(&last_check, NULL); poll_timeout = STANDARD_CHECK_INTERVAL; check_interval = STANDARD_CHECK_INTERVAL; while (1) { /* as well as the clients, check the eventfd */ pollfd[client_maxi+1].fd = efd; pollfd[client_maxi+1].events = POLLIN; rv = poll(pollfd, client_maxi + 2, poll_timeout); if (rv == -1 && errno == EINTR) continue; if (rv < 0) { /* not sure */ log_client(0, 0, "poll err %d", rv); } for (i = 0; i <= client_maxi + 1; i++) { /* * This index for efd has no client array entry. Its * only purpose is to wake up this poll loop in which * case we just clear any data and continue looking * for other client entries that need processing. 
*/ if (pollfd[i].fd == efd) { if (pollfd[i].revents & POLLIN) { log_client(i, efd, "efd wake"); /* N.B. i is not a ci */ eventfd_read(efd, &ebuf); } continue; } /* * FIXME? client_maxi is never reduced so over time we * end up checking and skipping some number of unused * client entries here which seems inefficient. */ if (client[i].fd < 0) continue; if (pollfd[i].revents & POLLIN) { workfn = client[i].workfn; if (workfn) workfn(i); } if (pollfd[i].revents & (POLLERR | POLLHUP | POLLNVAL)) { log_client(i, client[i].fd, "poll dead"); deadfn = client[i].deadfn; if (deadfn) deadfn(i); } } gettimeofday(&now, NULL); ms = time_diff(&last_check, &now); if (ms < check_interval) { poll_timeout = check_interval - ms; continue; } last_check = now; check_interval = STANDARD_CHECK_INTERVAL; /* * check the condition of each lockspace, * if pids are being killed, have pids all exited? * is its host_id being renewed?, if not kill pids */ pthread_mutex_lock(&spaces_mutex); list_for_each_entry_safe(sp, safe, &spaces, list) { if (sp->killing_pids && all_pids_dead(sp)) { /* * move sp to spaces_rem so main_loop * will no longer see it. 
*/ log_space(sp, "set thread_stop"); pthread_mutex_lock(&sp->mutex); sp->thread_stop = 1; deactivate_watchdog(sp); pthread_mutex_unlock(&sp->mutex); list_move(&sp->list, &spaces_rem); continue; } if (sp->killing_pids) { /* * continue to kill the pids with increasing * levels of severity until they all exit */ kill_pids(sp); check_interval = RECOVERY_CHECK_INTERVAL; continue; } /* * check host_id lease renewal */ if (sp->align_size > check_buf_len) { if (check_buf) free(check_buf); check_buf_len = sp->align_size; check_buf = malloc(check_buf_len); } if (check_buf) memset(check_buf, 0, check_buf_len); check_all = 0; rv = check_our_lease(sp, &check_all, check_buf); if (rv) sp->renew_fail = 1; if (rv || sp->external_remove || (external_shutdown > 1)) { log_space(sp, "set killing_pids check %d remove %d", rv, sp->external_remove); sp->space_dead = 1; sp->killing_pids = 1; kill_pids(sp); check_interval = RECOVERY_CHECK_INTERVAL; } else if (check_all) { check_other_leases(sp, check_buf); } } empty = list_empty(&spaces); pthread_mutex_unlock(&spaces_mutex); if (external_shutdown && empty) break; if (external_shutdown == 1) { log_debug("ignore shutdown, lockspace exists"); external_shutdown = 0; } free_lockspaces(0); rem_resources(); gettimeofday(&now, NULL); ms = time_diff(&last_check, &now); if (ms < check_interval) poll_timeout = check_interval - ms; else poll_timeout = 1; } free_lockspaces(1); daemon_shutdown_reply(); return 0; } static void *thread_pool_worker(void *data) { struct task task; struct cmd_args *ca; memset(&task, 0, sizeof(struct task)); setup_task_aio(&task, main_task.use_aio, WORKER_AIO_CB_SIZE); snprintf(task.name, NAME_ID_SIZE, "worker%ld", (long)data); pthread_mutex_lock(&pool.mutex); while (1) { while (!pool.quit && list_empty(&pool.work_data)) { pool.free_workers++; pthread_cond_wait(&pool.cond, &pool.mutex); pool.free_workers--; } while (!list_empty(&pool.work_data)) { ca = list_first_entry(&pool.work_data, struct cmd_args, list); 
list_del(&ca->list); pthread_mutex_unlock(&pool.mutex); call_cmd_thread(&task, ca); free(ca); pthread_mutex_lock(&pool.mutex); } if (pool.quit) break; } pool.num_workers--; if (!pool.num_workers) pthread_cond_signal(&pool.quit_wait); pthread_mutex_unlock(&pool.mutex); close_task_aio(&task); return NULL; } static int thread_pool_add_work(struct cmd_args *ca) { pthread_t th; int rv; pthread_mutex_lock(&pool.mutex); if (pool.quit) { pthread_mutex_unlock(&pool.mutex); return -1; } list_add_tail(&ca->list, &pool.work_data); if (!pool.free_workers && pool.num_workers < pool.max_workers) { rv = pthread_create(&th, NULL, thread_pool_worker, (void *)(long)pool.num_workers); if (rv < 0) { log_error("thread_pool_add_work ci %d error %d", ca->ci_in, rv); list_del(&ca->list); pthread_mutex_unlock(&pool.mutex); return rv; } pool.num_workers++; } pthread_cond_signal(&pool.cond); pthread_mutex_unlock(&pool.mutex); return 0; } static void thread_pool_free(void) { pthread_mutex_lock(&pool.mutex); pool.quit = 1; if (pool.num_workers > 0) { pthread_cond_broadcast(&pool.cond); pthread_cond_wait(&pool.quit_wait, &pool.mutex); } pthread_mutex_unlock(&pool.mutex); } static int thread_pool_create(int min_workers, int max_workers) { pthread_t th; int i, rv; memset(&pool, 0, sizeof(pool)); INIT_LIST_HEAD(&pool.work_data); pthread_mutex_init(&pool.mutex, NULL); pthread_cond_init(&pool.cond, NULL); pthread_cond_init(&pool.quit_wait, NULL); pool.max_workers = max_workers; for (i = 0; i < min_workers; i++) { rv = pthread_create(&th, NULL, thread_pool_worker, (void *)(long)i); if (rv < 0) break; pool.num_workers++; } if (rv < 0) thread_pool_free(); return rv; } /* * cmd comes from a transient client/fd set up just to pass the cmd, * and is not being done on behalf of another registered client/fd. * The command is processed independently of the lifetime of a specific * client or the tokens held by a specific client. 
*/
static void process_cmd_thread_unregistered(int ci_in, struct sm_header *h_recv)
{
	struct cmd_args *args;
	int err;

	args = malloc(sizeof(struct cmd_args));
	if (args) {
		/* hand a private copy of the header to the worker thread */
		args->ci_in = ci_in;
		memcpy(&args->header, h_recv, sizeof(struct sm_header));

		snprintf(client[ci_in].owner_name, SANLK_NAME_LEN, "cmd%d", h_recv->cmd);

		log_client(ci_in, client[ci_in].fd, "process cmd %u", h_recv->cmd);

		err = thread_pool_add_work(args);
		if (err >= 0)
			return;

		/* the work was not queued, so args is still ours to release */
		free(args);
	} else {
		err = -ENOMEM;
	}

	/*
	 * Failure: log it, then run client_recv_all, send_result and
	 * client_resume on the connection, in that order.
	 */
	log_error("cmd %d %d:%d process_unreg error %d",
		  h_recv->cmd, ci_in, client[ci_in].fd, err);
	client_recv_all(ci_in, h_recv, 0);
	send_result(ci_in, client[ci_in].fd, h_recv, err);
	client_resume(ci_in);
}

/*
 * cmd either comes from a registered client/fd, or is targeting a registered
 * client/fd.  The processing of the cmd is closely coordinated with the
 * lifetime of a specific client and to tokens held by that client.  Handling
 * of the client's death or changing of the client's tokens will be serialized
 * with the processing of this command.  This means that the end of processing
 * this command needs to check if the client failed during the command
 * processing and handle the cleanup of the client if so.
*/ static void process_cmd_thread_registered(int ci_in, struct sm_header *h_recv) { struct cmd_args *ca; struct client *cl; int result = 0; int rv, i, ci_target; ca = malloc(sizeof(struct cmd_args)); if (!ca) { result = -ENOMEM; goto fail; } if (h_recv->data2 != -1) { /* lease for another registered client with pid specified by data2 */ ci_target = -1; for (i = 0; i < client_size; i++) { cl = &client[i]; pthread_mutex_lock(&cl->mutex); if (cl->pid != h_recv->data2) { pthread_mutex_unlock(&cl->mutex); continue; } ci_target = i; break; } if (ci_target < 0) { if (h_recv->cmd != SM_CMD_INQUIRE) { /* inquire can be used to check if a pid exists */ log_error("cmd %d target pid %d not found", h_recv->cmd, h_recv->data2); } result = -ESRCH; goto fail; } log_client(ci_in, client[ci_in].fd, "process reg cmd %u target pid %d ci %d", h_recv->cmd, h_recv->data2, ci_target); } else { /* lease for this registered client */ log_client(ci_in, client[ci_in].fd, "process reg cmd %u", h_recv->cmd); ci_target = ci_in; cl = &client[ci_target]; pthread_mutex_lock(&cl->mutex); } if (!cl->used) { log_error("cmd %d %d,%d,%d not used", h_recv->cmd, ci_target, cl->fd, cl->pid); result = -EBUSY; goto out; } if (cl->pid <= 0) { log_error("cmd %d %d,%d,%d no pid", h_recv->cmd, ci_target, cl->fd, cl->pid); result = -EBUSY; goto out; } if (cl->pid_dead) { log_error("cmd %d %d,%d,%d pid_dead", h_recv->cmd, ci_target, cl->fd, cl->pid); result = -EBUSY; goto out; } if (cl->need_free) { log_error("cmd %d %d,%d,%d need_free", h_recv->cmd, ci_target, cl->fd, cl->pid); result = -EBUSY; goto out; } if (cl->kill_count && h_recv->cmd == SM_CMD_ACQUIRE) { /* when pid is being killed, we want killpath to be able to inquire and release for it */ log_error("cmd %d %d,%d,%d kill_count %d", h_recv->cmd, ci_target, cl->fd, cl->pid, cl->kill_count); result = -EBUSY; goto out; } if (cl->cmd_active) { if (com.quiet_fail && cl->cmd_active == SM_CMD_ACQUIRE) { result = -EBUSY; goto out; } log_error("cmd %d %d,%d,%d 
cmd_active %d", h_recv->cmd, ci_target, cl->fd, cl->pid, cl->cmd_active); result = -EBUSY; goto out; } cl->cmd_active = h_recv->cmd; /* once cmd_active is set, client_pid_dead() will not clear cl->tokens or call client_free, so it's the responsiblity of cmd_a,r,i_thread to check if pid_dead when clearing cmd_active, and doing the cleanup if pid is dead */ out: pthread_mutex_unlock(&cl->mutex); if (result < 0) goto fail; ca->ci_in = ci_in; ca->ci_target = ci_target; ca->cl_pid = cl->pid; ca->cl_fd = cl->fd; memcpy(&ca->header, h_recv, sizeof(struct sm_header)); rv = thread_pool_add_work(ca); if (rv < 0) { /* we don't have to worry about client_pid_dead having been called while mutex was unlocked with cmd_active set, because client_pid_dead is called from the main thread which is running this function */ log_error("create cmd thread failed"); pthread_mutex_lock(&cl->mutex); cl->cmd_active = 0; pthread_mutex_unlock(&cl->mutex); result = rv; goto fail; } return; fail: log_error("process_cmd_thread_reg failed ci %d fd %d cmd %u", ci_in, client[ci_in].fd, h_recv->cmd); client_recv_all(ci_in, h_recv, 0); send_result(ci_in, client[ci_in].fd, h_recv, result); client_resume(ci_in); if (ca) free(ca); } static void process_connection(int ci) { struct sm_header h; void (*deadfn)(int ci); int rv; memset(&h, 0, sizeof(h)); rv = recv(client[ci].fd, &h, sizeof(h), MSG_WAITALL); if (!rv) goto dead; log_client(ci, client[ci].fd, "recv %d %d", rv, h.cmd); if (rv < 0) { log_error("ci %d fd %d pid %d recv errno %d", ci, client[ci].fd, client[ci].pid, errno); goto dead; } if (rv != sizeof(h)) { log_error("ci %d fd %d pid %d recv size %d", ci, client[ci].fd, client[ci].pid, rv); goto dead; } if (h.magic != SM_MAGIC) { log_error("ci %d recv %d magic %x vs %x", ci, rv, h.magic, SM_MAGIC); goto dead; } if (client[ci].restricted & SANLK_RESTRICT_ALL) { log_error("ci %d fd %d pid %d cmd %d restrict all", ci, client[ci].fd, client[ci].pid, h.cmd); goto dead; } if (h.version && (h.cmd != 
SM_CMD_VERSION) && (h.version & 0xFFFF0000) > (SM_PROTO & 0xFFFF0000)) { log_error("ci %d recv %d proto %x vs %x", ci, rv, h.version , SM_PROTO); goto dead; } client[ci].cmd_last = h.cmd; switch (h.cmd) { case SM_CMD_REGISTER: case SM_CMD_RESTRICT: case SM_CMD_VERSION: case SM_CMD_SHUTDOWN: case SM_CMD_STATUS: case SM_CMD_HOST_STATUS: case SM_CMD_RENEWAL: case SM_CMD_LOG_DUMP: case SM_CMD_GET_LOCKSPACES: case SM_CMD_GET_HOSTS: case SM_CMD_REG_EVENT: case SM_CMD_END_EVENT: case SM_CMD_SET_CONFIG: call_cmd_daemon(ci, &h, client_maxi); break; case SM_CMD_ADD_LOCKSPACE: case SM_CMD_INQ_LOCKSPACE: case SM_CMD_REM_LOCKSPACE: case SM_CMD_REQUEST: case SM_CMD_EXAMINE_RESOURCE: case SM_CMD_EXAMINE_LOCKSPACE: case SM_CMD_ALIGN: case SM_CMD_WRITE_LOCKSPACE: case SM_CMD_WRITE_RESOURCE: case SM_CMD_READ_LOCKSPACE: case SM_CMD_READ_RESOURCE: case SM_CMD_READ_RESOURCE_OWNERS: case SM_CMD_SET_LVB: case SM_CMD_GET_LVB: case SM_CMD_SHUTDOWN_WAIT: case SM_CMD_SET_EVENT: case SM_CMD_FORMAT_RINDEX: case SM_CMD_REBUILD_RINDEX: case SM_CMD_UPDATE_RINDEX: case SM_CMD_LOOKUP_RINDEX: case SM_CMD_CREATE_RESOURCE: case SM_CMD_DELETE_RESOURCE: rv = client_suspend(ci); if (rv < 0) goto dead; process_cmd_thread_unregistered(ci, &h); break; case SM_CMD_ACQUIRE: case SM_CMD_RELEASE: case SM_CMD_INQUIRE: case SM_CMD_CONVERT: case SM_CMD_KILLPATH: /* the main_loop needs to ignore this connection while the thread is working on it */ rv = client_suspend(ci); if (rv < 0) goto dead; process_cmd_thread_registered(ci, &h); break; default: log_error("process_connection ci %d fd %d cmd %d unknown", ci, client[ci].fd, h.cmd); goto dead; }; return; dead: log_client(ci, client[ci].fd, "recv dead"); deadfn = client[ci].deadfn; if (deadfn) deadfn(ci); } static void process_listener(int ci GNUC_UNUSED) { int fd; int on = 1; fd = accept(client[ci].fd, NULL, NULL); if (fd < 0) return; setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on)); client_add(fd, process_connection, NULL); } static int 
setup_listener(void) { struct sockaddr_un addr; int rv, fd, ci; rv = sanlock_socket_address(run_dir, &addr); if (rv < 0) return rv; fd = socket(AF_LOCAL, SOCK_STREAM, 0); if (fd < 0) return fd; unlink(addr.sun_path); rv = bind(fd, (struct sockaddr *) &addr, sizeof(struct sockaddr_un)); if (rv < 0) goto exit_fail; rv = chmod(addr.sun_path, DEFAULT_SOCKET_MODE); if (rv < 0) goto exit_fail; rv = chown(addr.sun_path, com.uid, com.gid); if (rv < 0) { log_error("could not set socket %s permissions: %s", addr.sun_path, strerror(errno)); goto exit_fail; } rv = listen(fd, 5); if (rv < 0) goto exit_fail; fcntl(fd, F_SETFL, fcntl(fd, F_GETFL, 0) | O_NONBLOCK); ci = client_add(fd, process_listener, NULL); if (ci < 0) goto exit_fail; strcpy(client[ci].owner_name, "listener"); return 0; exit_fail: close(fd); return -1; } static void sigterm_handler(int sig GNUC_UNUSED, siginfo_t *info GNUC_UNUSED, void *ctx GNUC_UNUSED) { external_shutdown = 1; } static void setup_priority(void) { struct sched_param sched_param; int rv = 0; if (com.mlock_level == 1) rv = mlockall(MCL_CURRENT); else if (com.mlock_level == 2) rv = mlockall(MCL_CURRENT | MCL_FUTURE); if (rv < 0) { log_error("mlockall %d failed: %s", com.mlock_level, strerror(errno)); } if (!com.high_priority) return; rv = sched_get_priority_max(SCHED_RR); if (rv < 0) { log_error("could not get max scheduler priority err %d", errno); return; } sched_param.sched_priority = rv; rv = sched_setscheduler(0, SCHED_RR|SCHED_RESET_ON_FORK, &sched_param); if (rv < 0) { log_error("set scheduler RR|RESET_ON_FORK priority %d failed: %s", sched_param.sched_priority, strerror(errno)); } } /* return a random int between a and b inclusive */ int get_rand(int a, int b); int get_rand(int a, int b) { long int rv; pthread_mutex_lock(&rand_mutex); rv = random(); pthread_mutex_unlock(&rand_mutex); if (rv < 0) return rv; return a + (int) (((float)(b - a + 1)) * rv / (RAND_MAX+1.0)); } static void setup_host_name(void) { struct utsname name; char uuid[37]; 
uuid_t uu; int ret; memset(rand_state, 0, sizeof(rand_state)); initstate(time(NULL), rand_state, sizeof(rand_state)); /* use host name from command line */ if (com.our_host_name[0]) { memcpy(our_host_name_global, com.our_host_name, SANLK_NAME_LEN); return; } /* make up something that's likely to be different among hosts */ memset(&our_host_name_global, 0, sizeof(our_host_name_global)); memset(&name, 0, sizeof(name)); memset(&uuid, 0, sizeof(uuid)); uname(&name); uuid_generate(uu); uuid_unparse_lower(uu, uuid); ret = snprintf(our_host_name_global, NAME_ID_SIZE, "%s.", uuid); if (ret < NAME_ID_SIZE) memcpy(our_host_name_global+ret, name.nodename, NAME_ID_SIZE-ret); } static void setup_limits(void) { int rv; struct rlimit rlim = { .rlim_cur = -1, .rlim_max= -1 }; if (!privileged) return; rv = setrlimit(RLIMIT_MEMLOCK, &rlim); if (rv < 0) { log_error("cannot set the limits for memlock %i", errno); exit(EXIT_FAILURE); } rv = setrlimit(RLIMIT_RTPRIO, &rlim); if (rv < 0) { log_error("cannot set the limits for rtprio %i", errno); exit(EXIT_FAILURE); } rv = setrlimit(RLIMIT_CORE, &rlim); if (rv < 0) { log_error("cannot set the limits for core dumps %i", errno); exit(EXIT_FAILURE); } } static void setup_groups(void) { int rv; if (!com.uname || !com.gname || !privileged) return; rv = initgroups(com.uname, com.gid); if (rv < 0) { log_error("error initializing groups errno %i", errno); } } static void setup_uid_gid(void) { int rv; if (!com.uname || !com.gname || !privileged) return; rv = setgid(com.gid); if (rv < 0) { log_error("cannot set group id to %i errno %i", com.gid, errno); } rv = setuid(com.uid); if (rv < 0) { log_error("cannot set user id to %i errno %i", com.uid, errno); } /* When a program is owned by a user (group) other than the real user * (group) ID of the process, the PR_SET_DUMPABLE option gets cleared. * See RLIMIT_CORE in setup_limits and man 5 core. 
*/ rv = prctl(PR_SET_DUMPABLE, 1, 0, 0, 0); if (rv < 0) { log_error("cannot set dumpable process errno %i", errno); } } static void setup_signals(void) { struct sigaction act; int rv, i, sig_list[] = { SIGHUP, SIGINT, SIGTERM, 0 }; memset(&act, 0, sizeof(act)); act.sa_flags = SA_SIGINFO; act.sa_sigaction = sigterm_handler; for (i = 0; sig_list[i] != 0; i++) { rv = sigaction(sig_list[i], &act, NULL); if (rv < 0) { log_error("cannot set the signal handler for: %i", sig_list[i]); exit(EXIT_FAILURE); } } } /* * first pipe for daemon to send requests to helper; they are not acknowledged * and the daemon does not get any result back for the requests. * * second pipe for helper to send general status/heartbeat back to the daemon * every so often to confirm it's not dead/hung. If the helper gets stuck or * killed, the daemon will not get the status and won't bother sending requests * to the helper, and use SIGTERM instead */ static int setup_helper(void) { int pid; int pw_fd = -1; /* parent write */ int cr_fd = -1; /* child read */ int pr_fd = -1; /* parent read */ int cw_fd = -1; /* child write */ int pfd[2]; /* we can't allow the main daemon thread to block */ if (pipe2(pfd, O_NONBLOCK | O_CLOEXEC)) return -errno; /* uncomment for rhel7 where this should be available */ /* fcntl(pfd[1], F_SETPIPE_SZ, 1024*1024); */ cr_fd = pfd[0]; pw_fd = pfd[1]; if (pipe2(pfd, O_NONBLOCK | O_CLOEXEC)) { close(cr_fd); close(pw_fd); return -errno; } pr_fd = pfd[0]; cw_fd = pfd[1]; pid = fork(); if (pid < 0) { close(cr_fd); close(pw_fd); close(pr_fd); close(cw_fd); return -errno; } if (pid) { close(cr_fd); close(cw_fd); helper_kill_fd = pw_fd; helper_status_fd = pr_fd; helper_pid = pid; return 0; } else { close(pr_fd); close(pw_fd); run_helper(cr_fd, cw_fd, (log_stderr_priority == LOG_DEBUG)); exit(0); } } static void process_helper(int ci) { struct helper_status hs; int rv; memset(&hs, 0, sizeof(hs)); rv = read(client[ci].fd, &hs, sizeof(hs)); if (!rv || rv == -EAGAIN) return; if (rv < 0) 
{ log_error("process_helper rv %d errno %d", rv, errno); goto fail; } if (rv != sizeof(hs)) { log_error("process_helper recv size %d", rv); goto fail; } if (hs.type == HELPER_STATUS && !hs.status) helper_last_status = monotime(); return; fail: close_helper(); } static void helper_dead(int ci GNUC_UNUSED) { int pid = helper_pid; int rv, status; close_helper(); helper_pid = -1; rv = waitpid(pid, &status, WNOHANG); if (rv != pid) { /* should not happen */ log_error("helper pid %d dead wait %d", pid, rv); return; } if (WIFEXITED(status)) { log_error("helper pid %d exit status %d", pid, WEXITSTATUS(status)); return; } if (WIFSIGNALED(status)) { log_error("helper pid %d term signal %d", pid, WTERMSIG(status)); return; } /* should not happen */ log_error("helper pid %d state change", pid); } static int do_daemon(void) { struct utsname nodename; int fd, rv; run_dir = env_get(SANLOCK_RUN_DIR, DEFAULT_RUN_DIR); privileged = env_get_bool(SANLOCK_PRIVILEGED, 1); /* This can take a while so do it before forking. */ setup_groups(); if (!com.debug) { /* TODO: copy comprehensive daemonization method from libvirtd */ if (daemon(0, 0) < 0) { log_tool("cannot fork daemon\n"); exit(EXIT_FAILURE); } } setup_limits(); setup_helper(); /* main task never does disk io, so we don't really need to set * it up, but other tasks get their use_aio value by copying * the main_task settings */ sprintf(main_task.name, "%s", "main"); setup_task_aio(&main_task, com.aio_arg, 0); rv = client_alloc(); if (rv < 0) return rv; helper_ci = client_add(helper_status_fd, process_helper, helper_dead); if (helper_ci < 0) return rv; strcpy(client[helper_ci].owner_name, "helper"); setup_signals(); setup_logging(); if (strcmp(run_dir, DEFAULT_RUN_DIR)) log_warn("Using non-standard run directory '%s'", run_dir); if (!privileged) log_warn("Running in unprivileged mode"); /* If we run as root, make run_dir owned by root, so we can create the * lockfile when selinux disables DAC_OVERRIDE. 
* See https://danwalsh.livejournal.com/79643.html */ fd = lockfile(run_dir, SANLK_LOCKFILE_NAME, com.uid, privileged ? 0 : com.gid); if (fd < 0) { close_logging(); return fd; } setup_host_name(); setup_uid_gid(); uname(&nodename); log_warn("sanlock daemon started %s host %s (%s)", VERSION, our_host_name_global, nodename.nodename); setup_priority(); rv = thread_pool_create(DEFAULT_MIN_WORKER_THREADS, com.max_worker_threads); if (rv < 0) goto out; rv = setup_listener(); if (rv < 0) goto out_threads; setup_token_manager(); if (rv < 0) goto out_threads; /* initialize global eventfd for client_resume notification */ if ((efd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK)) == -1) { log_error("couldn't create eventfd"); goto out_threads; } main_loop(); close_token_manager(); out_threads: thread_pool_free(); out: /* order reversed from setup so lockfile is last */ close_logging(); close(fd); return rv; } static int user_to_uid(char *arg) { struct passwd *pw; pw = getpwnam(arg); if (pw == NULL) { log_error("user '%s' not found, " "using uid: %i", arg, DEFAULT_SOCKET_UID); return DEFAULT_SOCKET_UID; } return pw->pw_uid; } static int group_to_gid(char *arg) { struct group *gr; gr = getgrnam(arg); if (gr == NULL) { log_error("group '%s' not found, " "using uid: %i", arg, DEFAULT_SOCKET_GID); return DEFAULT_SOCKET_GID; } return gr->gr_gid; } static int parse_arg_rentry(char *str) { char *name = NULL; char *offset = NULL; if (!str) return -EINVAL; /* "-r :1M" can be used to specify only an offset */ if (str[0] != ':') name = str; if ((offset = strchr(str, ':'))) { uint64_t offnum; char *m ; *offset = '\0'; offset++; if ((m = strchr(offset, 'M'))) { *m = '\0'; offnum = atoll(offset) * 1024 * 1024; } else { offnum = atoll(offset); } com.rentry.offset = offnum; } if (name) strncpy(com.rentry.name, name, SANLK_NAME_LEN); return 0; } static int parse_arg_rindex(char *str) { char *ls_name = NULL; char *path = NULL; char *offset = NULL; int i; if (!str) return -EINVAL; ls_name = &str[0]; for 
(i = 0; i < strlen(str); i++) { if (str[i] == '\\') { i++; continue; } if (str[i] == ':') { if (!path) path = &str[i]; else if (!offset) offset = &str[i]; } } if (path) { *path = '\0'; path++; } if (offset) { *offset= '\0'; offset++; } if (ls_name) strncpy(com.rindex.lockspace_name, ls_name, SANLK_NAME_LEN); if (path) sanlock_path_import(com.rindex.disk.path, path, sizeof(com.rindex.disk.path)); if (offset) { uint64_t offnum; char *m ; if ((m = strchr(offset, 'M'))) { *m = '\0'; offnum = atoll(offset) * 1024 * 1024; } else { offnum = atoll(offset); } com.rindex.disk.offset = offnum; } return 0; } /* ::: */ static int parse_arg_lockspace(char *arg) { char offstr[16]; char *colon1, *colon2, *colon3, *m, *p; uint64_t offnum = 0; char *arg2 = NULL; int len = strlen(arg); int len2 = 0; int i; /* * If the arg string uses an offset with the 'M' suffix, then * convert it to a string without 'M'. */ if ((colon1 = strchr(arg, ':'))) { if ((colon2 = strchr(colon1+1, ':'))) { if ((colon3 = strchr(colon2+1, ':'))) { if ((m = strchr(colon3+1, 'M'))) { p = colon3+1; i = 0; while (1) { offstr[i++] = *p; p++; if (p == m) break; } offnum = atoll(offstr) * 1024 * 1024; /* terminate 'arg' before offset */ *colon3 = '\0'; len2 = len + 64; arg2 = malloc(len2); if (!arg2) return -1; memset(arg2, 0, len2); snprintf(arg2, len2, "%s:%llu", arg, (unsigned long long)offnum); } } } } if (arg2) sanlock_str_to_lockspace(arg2, &com.lockspace); else sanlock_str_to_lockspace(arg, &com.lockspace); log_debug("lockspace %s host_id %llu path %s offset %llu", com.lockspace.name, (unsigned long long)com.lockspace.host_id, com.lockspace.host_id_disk.path, (unsigned long long)com.lockspace.host_id_disk.offset); return 0; } /* :::[:] */ static int parse_arg_resource(char *arg) { struct sanlk_resource *res; char offstr[16]; char *colon1, *colon2, *colon3, *colon4, *m, *p; uint64_t offnum = 0; char *arg2 = NULL; int len = strlen(arg); int len2 = 0; int rv, i; if (com.res_count >= SANLK_MAX_RESOURCES) { 
log_tool("resource args over max %d", SANLK_MAX_RESOURCES); return -1; } memset(offstr, 0, sizeof(offstr)); /* * If the arg string uses an offset with the 'M' suffix, then * convert it to a string without 'M'. */ if ((colon1 = strchr(arg, ':'))) { if ((colon2 = strchr(colon1+1, ':'))) { if ((colon3 = strchr(colon2+1, ':'))) { colon4 = strchr(colon3+1, ':'); /* optional */ if ((m = strchr(colon3+1, 'M'))) { p = colon3+1; i = 0; while (1) { offstr[i++] = *p; p++; if (p == m) break; } offnum = atoll(offstr) * 1024 * 1024; /* terminate 'arg' before offset */ *colon3 = '\0'; len2 = len + 64; arg2 = malloc(len2); if (!arg2) return -1; memset(arg2, 0, len2); if (!colon4) snprintf(arg2, len2, "%s:%llu", arg, (unsigned long long)offnum); else snprintf(arg2, len2, "%s:%llu%s", arg, (unsigned long long)offnum, colon4); } } } } if (arg2) rv = sanlock_str_to_res(arg2, &res); else rv = sanlock_str_to_res(arg, &res); if (rv < 0) { log_tool("resource arg parse error %d\n", rv); return rv; } com.res_args[com.res_count] = res; com.res_count++; log_debug("resource %s %s num_disks %d flags %x lver %llu", res->lockspace_name, res->name, res->num_disks, res->flags, (unsigned long long)res->lver); for (i = 0; i < res->num_disks; i++) { log_debug("resource disk %s %llu", res->disks[i].path, (unsigned long long)res->disks[i].offset); } return 0; } /* * daemon: acquires leases for the local host_id, associates them with a local * pid, and releases them when the associated pid exits. * * client: ask daemon to acquire/release leases associated with a given pid. * * direct: acquires and releases leases directly for the local host_id by * reading and writing storage directly. 
*/ static void print_usage(void) { printf("Usage:\n"); printf("sanlock ...\n\n"); printf("commands:\n"); printf(" daemon start daemon\n"); printf(" client send request to daemon (default type if none given)\n"); printf(" direct access storage directly (no coordination with daemon)\n"); printf(" help print this usage (defaults in parens)\n"); printf(" version print version\n"); printf("\n"); printf("sanlock daemon [options]\n"); printf(" -D no fork and print all logging to stderr\n"); printf(" -Q 0|1 quiet error messages for common lock contention (%d)\n", DEFAULT_QUIET_FAIL); printf(" -R 0|1 renewal debugging, log debug info about renewals (0)\n"); printf(" -H renewal history size (%d)\n", DEFAULT_RENEWAL_HISTORY_SIZE); printf(" -L write logging at priority level and up to logfile (4 LOG_WARNING)\n"); printf(" (use -1 for none)\n"); printf(" -S write logging at priority level and up to syslog (3 LOG_ERR)\n"); printf(" (use -1 for none)\n"); printf(" -U user id\n"); printf(" -G group id\n"); printf(" -t max worker threads (%d)\n", DEFAULT_MAX_WORKER_THREADS); printf(" -g seconds for graceful recovery (%d)\n", DEFAULT_GRACE_SEC); printf(" -w 0|1 use watchdog through wdmd (%d)\n", DEFAULT_USE_WATCHDOG); printf(" -h 0|1 use high priority (RR) scheduling (%d)\n", DEFAULT_HIGH_PRIORITY); printf(" -l use mlockall (0 none, 1 current, 2 current and future) (%d)\n", DEFAULT_MLOCK_LEVEL); printf(" -b seconds a host id bit will remain set in delta lease bitmap\n"); printf(" (default: 6 * io_timeout)\n"); printf(" -e local host name used in delta leases\n"); printf(" (default: generate new uuid)\n"); printf("\n"); printf("sanlock client [options]\n"); printf("sanlock client status [-D] [-o p|s]\n"); printf("sanlock client gets [-h 0|1]\n"); printf("sanlock client host_status -s LOCKSPACE [-D]\n"); printf("sanlock client renewal -s LOCKSPACE\n"); printf("sanlock client set_event -s LOCKSPACE -i [-g gen] -e -d \n"); printf("sanlock client set_config -s LOCKSPACE [-u 0|1] [-O 
0|1]\n"); printf("sanlock client log_dump\n"); printf("sanlock client shutdown [-f 0|1] [-w 0|1]\n"); printf("sanlock client init -s LOCKSPACE | -r RESOURCE [-z 0|1] [-Z 512|4096 -A 1M|2M|4M|8M]\n"); printf("sanlock client read -s LOCKSPACE | -r RESOURCE [-D]\n"); printf("sanlock client add_lockspace -s LOCKSPACE\n"); printf("sanlock client inq_lockspace -s LOCKSPACE\n"); printf("sanlock client rem_lockspace -s LOCKSPACE\n"); printf("sanlock client command -r RESOURCE -c \n"); printf("sanlock client acquire -r RESOURCE -p \n"); printf("sanlock client convert -r RESOURCE -p \n"); printf("sanlock client release -r RESOURCE -p \n"); printf("sanlock client inquire -p \n"); printf("sanlock client request -r RESOURCE -f \n"); printf("sanlock client examine -r RESOURCE | -s LOCKSPACE\n"); printf("sanlock client format -x RINDEX [-Z 512|4096 -A 1M|2M|4M|8M]\n"); printf("sanlock client create -x RINDEX -e \n"); printf("sanlock client delete -x RINDEX -e [:]\n"); printf("sanlock client lookup -x RINDEX [-e :]\n"); printf("sanlock client update -x RINDEX -e [:] [-z 0|1]\n"); printf("sanlock client rebuild -x RINDEX\n"); printf("\n"); printf("sanlock direct [-a 0|1] [-o 0|1] [-Z 512|4096 -A 1M|2M|4M|8M]\n"); printf("sanlock direct init -s LOCKSPACE | -r RESOURCE [-Z 512|4096 -A 1M|2M|4M|8M]\n"); printf("sanlock direct read_leader -s LOCKSPACE | -r RESOURCE\n"); printf("sanlock direct dump [:[:]]\n"); printf("sanlock direct format -x RINDEX [-Z 512|4096 -A 1M|2M|4M|8M]\n"); printf("sanlock direct lookup -x RINDEX [-e :]\n"); printf("sanlock direct update -x RINDEX -e [:] [-z 0|1]\n"); printf("sanlock direct rebuild -x RINDEX\n"); printf("\n"); printf("LOCKSPACE = :::\n"); printf(" name of lockspace\n"); printf(" local host identifier in lockspace\n"); printf(" path to storage reserved for leases\n"); printf(" offset on path (bytes)\n"); printf("\n"); printf("RESOURCE = :::[:]\n"); printf(" name of lockspace\n"); printf(" name of resource\n"); printf(" path to storage reserved 
for leases\n"); printf(" offset on path (bytes)\n"); printf(" optional leader version or SH for shared lease\n"); printf("\n"); printf("RINDEX = ::\n"); printf(" name of lockspace\n"); printf(" path to storage reserved for leases\n"); printf(" offset on path (bytes)\n"); printf("\n"); printf("Limits:\n"); printf("valid sector/align size combinations: 512/1M, 4K/1M, 4K/2M, 4K/4M, 4K/8M\n"); printf("maximum host_id for sector/align sizes: 2000, 250, 500, 1000, 2000\n"); printf("maximum name length for lockspaces and resources: %d\n", SANLK_NAME_LEN); printf("maximum path length: %d\n", SANLK_PATH_LEN); printf("maximum client process connections: 1000\n"); /* NALLOC */ printf("\n"); } static int read_command_line(int argc, char *argv[]) { char optchar; char *optionarg; char *p; char *arg1 = argv[1]; char *act; int i, j, len, sec, begin_command = 0; if (argc < 2 || !strcmp(arg1, "help") || !strcmp(arg1, "--help") || !strcmp(arg1, "-h")) { print_usage(); exit(EXIT_SUCCESS); } if (!strcmp(arg1, "version")) { printf("%u.%u.%u\n", sanlock_version_major, sanlock_version_minor, sanlock_version_patch); exit(EXIT_SUCCESS); } if (!strcmp(arg1, "--version") || !strcmp(arg1, "-V")) { printf("%s %s (built %s %s)\n", argv[0], VERSION, __DATE__, __TIME__); exit(EXIT_SUCCESS); } if (!strcmp(arg1, "daemon")) { com.type = COM_DAEMON; i = 2; } else if (!strcmp(arg1, "direct")) { com.type = COM_DIRECT; if (argc < 3) { print_usage(); exit(EXIT_FAILURE); } act = argv[2]; i = 3; } else if (!strcmp(arg1, "client")) { com.type = COM_CLIENT; if (argc < 3) { print_usage(); exit(EXIT_FAILURE); } act = argv[2]; i = 3; } else { com.type = COM_CLIENT; act = argv[1]; i = 2; } switch (com.type) { case COM_DAEMON: break; case COM_CLIENT: if (!strcmp(act, "status")) com.action = ACT_STATUS; else if (!strcmp(act, "host_status")) com.action = ACT_HOST_STATUS; else if (!strcmp(act, "renewal")) com.action = ACT_RENEWAL; else if (!strcmp(act, "gets")) com.action = ACT_GETS; else if (!strcmp(act, 
"log_dump")) com.action = ACT_LOG_DUMP; else if (!strcmp(act, "shutdown")) com.action = ACT_SHUTDOWN; else if (!strcmp(act, "add_lockspace")) com.action = ACT_ADD_LOCKSPACE; else if (!strcmp(act, "inq_lockspace")) com.action = ACT_INQ_LOCKSPACE; else if (!strcmp(act, "rem_lockspace")) com.action = ACT_REM_LOCKSPACE; else if (!strcmp(act, "command")) com.action = ACT_COMMAND; else if (!strcmp(act, "acquire")) com.action = ACT_ACQUIRE; else if (!strcmp(act, "convert")) com.action = ACT_CONVERT; else if (!strcmp(act, "release")) com.action = ACT_RELEASE; else if (!strcmp(act, "inquire")) com.action = ACT_INQUIRE; else if (!strcmp(act, "request")) com.action = ACT_REQUEST; else if (!strcmp(act, "examine")) com.action = ACT_EXAMINE; else if (!strcmp(act, "align")) com.action = ACT_CLIENT_ALIGN; else if (!strcmp(act, "init")) com.action = ACT_CLIENT_INIT; else if (!strcmp(act, "write")) com.action = ACT_CLIENT_INIT; else if (!strcmp(act, "read")) com.action = ACT_CLIENT_READ; else if (!strcmp(act, "version")) com.action = ACT_VERSION; else if (!strcmp(act, "set_event")) com.action = ACT_SET_EVENT; else if (!strcmp(act, "set_config")) com.action = ACT_SET_CONFIG; else if (!strcmp(act, "format")) { com.action = ACT_FORMAT; com.rindex_op = RX_OP_FORMAT; } else if (!strcmp(act, "rebuild")) { com.action = ACT_REBUILD; com.rindex_op = RX_OP_REBUILD; } else if (!strcmp(act, "create")) { com.action = ACT_CREATE; com.rindex_op = RX_OP_CREATE; } else if (!strcmp(act, "delete")) { com.action = ACT_DELETE; com.rindex_op = RX_OP_DELETE; } else if (!strcmp(act, "lookup")) { com.action = ACT_LOOKUP; com.rindex_op = RX_OP_LOOKUP; } else if (!strcmp(act, "update")) { com.action = ACT_UPDATE; com.rindex_op = RX_OP_UPDATE; } else { log_tool("client action \"%s\" is unknown", act); exit(EXIT_FAILURE); } break; case COM_DIRECT: if (!strcmp(act, "init")) com.action = ACT_DIRECT_INIT; else if (!strcmp(act, "dump")) com.action = ACT_DUMP; else if (!strcmp(act, "next_free")) com.action = 
ACT_NEXT_FREE; else if (!strcmp(act, "read_leader")) com.action = ACT_READ_LEADER; else if (!strcmp(act, "write_leader")) com.action = ACT_WRITE_LEADER; else if (!strcmp(act, "acquire")) com.action = ACT_ACQUIRE; else if (!strcmp(act, "release")) com.action = ACT_RELEASE; else if (!strcmp(act, "acquire_id")) com.action = ACT_ACQUIRE_ID; else if (!strcmp(act, "release_id")) com.action = ACT_RELEASE_ID; else if (!strcmp(act, "renew_id")) com.action = ACT_RENEW_ID; else if (!strcmp(act, "format")) { com.action = ACT_FORMAT; com.rindex_op = RX_OP_FORMAT; } else if (!strcmp(act, "rebuild")) { com.action = ACT_REBUILD; com.rindex_op = RX_OP_REBUILD; } else if (!strcmp(act, "lookup")) { com.action = ACT_LOOKUP; com.rindex_op = RX_OP_LOOKUP; } else if (!strcmp(act, "update")) { com.action = ACT_UPDATE; com.rindex_op = RX_OP_UPDATE; } else { log_tool("direct action \"%s\" is unknown", act); exit(EXIT_FAILURE); } break; }; /* actions that have an option without dash-letter prefix */ if (com.action == ACT_DUMP || com.action == ACT_NEXT_FREE) { if (argc < 4) exit(EXIT_FAILURE); optionarg = argv[i++]; com.dump_path = strdup(optionarg); } for (; i < argc; ) { p = argv[i]; if ((p[0] != '-') || (strlen(p) != 2)) { log_tool("unknown option %s", p); log_tool("space required before option value"); exit(EXIT_FAILURE); } optchar = p[1]; i++; /* the only option that does not have optionarg */ if (optchar == 'D') { com.debug = 1; log_stderr_priority = LOG_DEBUG; continue; } if (i >= argc) { log_tool("option '%c' requires arg", optchar); exit(EXIT_FAILURE); } optionarg = argv[i]; switch (optchar) { case 'Q': com.quiet_fail = atoi(optionarg); break; case 'R': com.debug_renew = atoi(optionarg); break; case 'H': com.renewal_history_size = atoi(optionarg); break; case 'L': log_logfile_priority = atoi(optionarg); break; case 'S': log_syslog_priority = atoi(optionarg); break; case 'F': com.file_path = strdup(optionarg); break; case 'a': com.all = atoi(optionarg); com.aio_arg = atoi(optionarg); 
if (com.aio_arg && com.aio_arg != 1) com.aio_arg = 1; break; case 't': com.max_worker_threads = atoi(optionarg); if (com.max_worker_threads < DEFAULT_MIN_WORKER_THREADS) com.max_worker_threads = DEFAULT_MIN_WORKER_THREADS; break; case 'w': com.use_watchdog = atoi(optionarg); com.wait = atoi(optionarg); break; case 'h': if (com.action == ACT_GETS || com.action == ACT_CLIENT_READ) com.get_hosts = atoi(optionarg); else com.high_priority = atoi(optionarg); break; case 'l': com.mlock_level = atoi(optionarg); break; case 'o': if (com.action == ACT_STATUS) { com.sort_arg = *optionarg; } else { com.io_timeout_arg = atoi(optionarg); if (!com.io_timeout_arg) com.io_timeout_arg = DEFAULT_IO_TIMEOUT; } break; case 'b': com.set_bitmap_seconds = atoi(optionarg); break; case 'n': com.num_hosts = atoi(optionarg); break; case 'm': com.max_hosts = atoi(optionarg); break; case 'p': com.pid = atoi(optionarg); break; case 'd': com.he_data = strtoull(optionarg, NULL, 0); break; case 'e': if (com.rindex_op) { parse_arg_rentry(optionarg); } else { strncpy(com.our_host_name, optionarg, NAME_ID_SIZE); com.he_event = strtoull(optionarg, NULL, 0); } break; case 'i': com.host_id = strtoull(optionarg, NULL, 0); break; case 'g': if (com.type == COM_DAEMON) { sec = atoi(optionarg); if (sec <= 60 && sec >= 0) kill_grace_seconds = sec; } else { com.host_generation = strtoull(optionarg, NULL, 0); } break; case 'f': com.force_mode = strtoul(optionarg, NULL, 0); break; case 's': parse_arg_lockspace(optionarg); /* com.lockspace */ break; case 'r': parse_arg_resource(optionarg); /* com.res_args[] */ break; case 'U': com.uname = optionarg; com.uid = user_to_uid(optionarg); break; case 'G': com.gname = optionarg; com.gid = group_to_gid(optionarg); break; case 'O': com.orphan_set = 1; com.orphan = atoi(optionarg); break; case 'P': com.persistent = atoi(optionarg); break; case 'u': com.used_set = 1; com.used = atoi(optionarg); break; case 'x': parse_arg_rindex(optionarg); break; case 'z': com.clear_arg = 1; 
break; case 'c': begin_command = 1; break; case 'A': if (!strcmp(optionarg, "1M")) com.align_size = ALIGN_SIZE_1M; else if (!strcmp(optionarg, "2M")) com.align_size = ALIGN_SIZE_2M; else if (!strcmp(optionarg, "4M")) com.align_size = ALIGN_SIZE_4M; else if (!strcmp(optionarg, "8M")) com.align_size = ALIGN_SIZE_8M; break; case 'Z': com.sector_size = atoi(optionarg); break; default: log_tool("unknown option: %c", optchar); exit(EXIT_FAILURE); }; if (begin_command) break; i++; } if (!com.sector_size && !com.align_size) { } else if (com.sector_size && !com.align_size) { } else if (((com.sector_size == 512) && (com.align_size == ALIGN_SIZE_1M)) || ((com.sector_size == 4096) && (com.align_size == ALIGN_SIZE_1M)) || ((com.sector_size == 4096) && (com.align_size == ALIGN_SIZE_2M)) || ((com.sector_size == 4096) && (com.align_size == ALIGN_SIZE_4M)) || ((com.sector_size == 4096) && (com.align_size == ALIGN_SIZE_8M))) { } else { log_tool("Invalid sector_size/align_size combination (%d/%d)", com.sector_size, com.align_size); log_tool("Use one of: 512/1M, 4096/1M, 4096/2M, 4096/4M, 4096/8M."); return -EINVAL; } /* * the remaining args are for the command * * sanlock -r foo -n 2 -d bar:0 -c /bin/cmd -X -Y -Z * argc = 12 * loop above breaks with i = 8, argv[8] = "/bin/cmd" * * cmd_argc = 4 = argc (12) - i (8) * cmd_argv[0] = "/bin/cmd" * cmd_argv[1] = "-X" * cmd_argv[2] = "-Y" * cmd_argv[3] = "-Z" * cmd_argv[4] = NULL (required by execv) */ if (begin_command) { cmd_argc = argc - i; if (cmd_argc < 1) { log_tool("command option (-c) requires an arg"); return -EINVAL; } len = (cmd_argc + 1) * sizeof(char *); /* +1 for final NULL */ cmd_argv = malloc(len); if (!cmd_argv) return -ENOMEM; memset(cmd_argv, 0, len); for (j = 0; j < cmd_argc; j++) { cmd_argv[j] = strdup(argv[i++]); if (!cmd_argv[j]) return -ENOMEM; } strncpy(command, cmd_argv[0], COMMAND_MAX - 1); } return 0; } uint32_t cmd_str_to_num(const char *str) { if (!strcmp(str, "inq_lockspace")) return SM_CMD_INQ_LOCKSPACE; if 
(!strcmp(str, "read_resource_owners")) return SM_CMD_READ_RESOURCE_OWNERS; if (!strcmp(str, "get_lockspaces")) return SM_CMD_GET_LOCKSPACES; if (!strcmp(str, "get_hosts")) return SM_CMD_GET_HOSTS; if (!strcmp(str, "register")) return SM_CMD_REGISTER; if (!strcmp(str, "add_lockspace")) return SM_CMD_ADD_LOCKSPACE; if (!strcmp(str, "rem_lockspace")) return SM_CMD_REM_LOCKSPACE; if (!strcmp(str, "shutdown")) return SM_CMD_SHUTDOWN; if (!strcmp(str, "status")) return SM_CMD_STATUS; if (!strcmp(str, "acquire")) return SM_CMD_ACQUIRE; if (!strcmp(str, "release")) return SM_CMD_RELEASE; if (!strcmp(str, "inquire")) return SM_CMD_INQUIRE; if (!strcmp(str, "restrict")) return SM_CMD_RESTRICT; if (!strcmp(str, "request")) return SM_CMD_REQUEST; if (!strcmp(str, "align")) return SM_CMD_ALIGN; if (!strcmp(str, "examine_lockspace")) return SM_CMD_EXAMINE_LOCKSPACE; if (!strcmp(str, "examine_resource")) return SM_CMD_EXAMINE_RESOURCE; if (!strcmp(str, "host_status")) return SM_CMD_HOST_STATUS; if (!strcmp(str, "killpath")) return SM_CMD_KILLPATH; if (!strcmp(str, "write_lockspace")) return SM_CMD_WRITE_LOCKSPACE; if (!strcmp(str, "write_resource")) return SM_CMD_WRITE_RESOURCE; if (!strcmp(str, "read_lockspace")) return SM_CMD_READ_LOCKSPACE; if (!strcmp(str, "read_resource")) return SM_CMD_READ_RESOURCE; if (!strcmp(str, "set_lvb")) return SM_CMD_SET_LVB; if (!strcmp(str, "get_lvb")) return SM_CMD_GET_LVB; if (!strcmp(str, "convert")) return SM_CMD_CONVERT; if (!strcmp(str, "version")) return SM_CMD_VERSION; if (!strcmp(str, "shutdown_wait")) return SM_CMD_SHUTDOWN_WAIT; if (!strcmp(str, "reg_event")) return SM_CMD_REG_EVENT; if (!strcmp(str, "end_event")) return SM_CMD_END_EVENT; if (!strcmp(str, "set_event")) return SM_CMD_SET_EVENT; if (!strcmp(str, "set_config")) return SM_CMD_SET_CONFIG; if (!strcmp(str, "renewal")) return SM_CMD_RENEWAL; if (!strcmp(str, "format_rindex")) return SM_CMD_FORMAT_RINDEX; if (!strcmp(str, "update_rindex")) return SM_CMD_UPDATE_RINDEX; if 
(!strcmp(str, "lookup_rindex")) return SM_CMD_LOOKUP_RINDEX; if (!strcmp(str, "create_resource")) return SM_CMD_CREATE_RESOURCE; if (!strcmp(str, "delete_resource")) return SM_CMD_DELETE_RESOURCE; if (!strcmp(str, "rebuild_rindex")) return SM_CMD_REBUILD_RINDEX; if (!strcmp(str, "log_dump")) return SM_CMD_LOG_DUMP; log_debug("unknown cmd string %.16s", str); return 0; } uint64_t cmd_num_to_debug_flag(uint32_t cmd) { return ((uint64_t)1 << cmd); } int is_cmd_debug(uint32_t cmd) { uint64_t flag = cmd_num_to_debug_flag(cmd); if (com.debug_cmds & flag) return 1; return 0; } void set_cmd_debug(uint32_t cmd) { uint64_t flag = cmd_num_to_debug_flag(cmd); com.debug_cmds |= flag; } void clear_cmd_debug(uint32_t cmd) { uint64_t flag = cmd_num_to_debug_flag(cmd); com.debug_cmds &= ~flag; } #define MAX_CONF_LINE 128 static void get_val_int(char *line, int *val_out) { char key[MAX_CONF_LINE]; char val[MAX_CONF_LINE]; int rv; rv = sscanf(line, "%[^=]=%s", key, val); if (rv != 2) return; *val_out = atoi(val); } static void get_val_str(char *line, char *val_out) { char key[MAX_CONF_LINE]; char val[MAX_CONF_LINE]; int rv; rv = sscanf(line, "%[^=]=%s", key, val); if (rv != 2) return; strcpy(val_out, val); } static void read_config_file(void) { FILE *file; struct stat buf; char line[MAX_CONF_LINE]; char str[MAX_CONF_LINE]; uint32_t cmd; int i, val; if (stat(SANLK_CONF_PATH, &buf) < 0) { if (errno != ENOENT) log_error("%s stat failed: %d", SANLK_CONF_PATH, errno); return; } file = fopen(SANLK_CONF_PATH, "r"); if (!file) return; while (fgets(line, MAX_CONF_LINE, file)) { if (line[0] == '#') continue; if (line[0] == '\n') continue; memset(str, 0, sizeof(str)); for (i = 0; i < MAX_CONF_LINE; i++) { if (line[i] == ' ') break; if (line[i] == '=') break; if (line[i] == '\0') break; if (line[i] == '\n') break; if (line[i] == '\t') break; str[i] = line[i]; } if (!strcmp(str, "quiet_fail")) { get_val_int(line, &val); com.quiet_fail = val; } else if (!strcmp(str, "debug_renew")) { 
get_val_int(line, &val); com.debug_renew = val; } else if (!strcmp(str, "use_aio")) { get_val_int(line, &val); com.aio_arg = val; } else if (!strcmp(str, "logfile_priority")) { get_val_int(line, &val); log_logfile_priority = val; } else if (!strcmp(str, "logfile_use_utc")) { get_val_int(line, &val); log_logfile_use_utc = val; } else if (!strcmp(str, "syslog_priority")) { get_val_int(line, &val); log_syslog_priority = val; } else if (!strcmp(str, "names_log_priority")) { get_val_int(line, &val); com.names_log_priority = val; } else if (!strcmp(str, "use_watchdog")) { get_val_int(line, &val); com.use_watchdog = val; } else if (!strcmp(str, "high_priority")) { get_val_int(line, &val); com.high_priority = val; } else if (!strcmp(str, "mlock_level")) { get_val_int(line, &val); com.mlock_level = val; } else if (!strcmp(str, "sh_retries")) { get_val_int(line, &val); com.sh_retries = val; } else if (!strcmp(str, "uname")) { memset(str, 0, sizeof(str)); get_val_str(line, str); com.uname = strdup(str); com.uid = user_to_uid(str); } else if (!strcmp(str, "gname")) { memset(str, 0, sizeof(str)); get_val_str(line, str); com.gname = strdup(str); com.gid = group_to_gid(str); } else if (!strcmp(str, "our_host_name")) { memset(str, 0, sizeof(str)); get_val_str(line, str); strncpy(com.our_host_name, str, NAME_ID_SIZE); } else if (!strcmp(str, "renewal_read_extend_sec")) { /* zero is a valid setting so we need the _set field to say it's set */ get_val_int(line, &val); com.renewal_read_extend_sec_set = 1; com.renewal_read_extend_sec = val; } else if (!strcmp(str, "write_init_io_timeout")) { get_val_int(line, &val); if (val > 0) com.write_init_io_timeout = val; } else if (!strcmp(str, "renewal_history_size")) { get_val_int(line, &val); com.renewal_history_size = val; } else if (!strcmp(str, "paxos_debug_all")) { get_val_int(line, &val); com.paxos_debug_all = val; } else if (!strcmp(str, "debug_io")) { memset(str, 0, sizeof(str)); get_val_str(line, str); if (strstr(str, "submit")) 
com.debug_io_submit = 1; if (strstr(str, "complete")) com.debug_io_complete = 1; } else if (!strcmp(str, "debug_clients")) { get_val_int(line, &val); com.debug_clients = val; } else if (!strcmp(str, "debug_cmd")) { get_val_str(line, str); if (!strcmp(str, "+all")) com.debug_cmds = ~0LL; else if (!strcmp(str, "-all")) com.debug_cmds = 0LL; else { cmd = cmd_str_to_num(str+1); if (cmd && (str[0] == '+')) set_cmd_debug(cmd); else if (cmd && (str[0] == '-')) clear_cmd_debug(cmd); } } else if (!strcmp(str, "max_sectors_kb")) { memset(str, 0, sizeof(str)); get_val_str(line, str); if (strstr(str, "ignore")) { com.max_sectors_kb_ignore = 1; com.max_sectors_kb_align = 0; com.max_sectors_kb_num = 0; } else if (strstr(str, "align")) { com.max_sectors_kb_ignore = 0; com.max_sectors_kb_align = 1; com.max_sectors_kb_num = 0; } else if (isdigit(str[0])) { int num = atoi(str); if (!num || (num % 2) || (num > 8192)) { log_error("ignore invalid num max_sectors_kb %s", str); } else { com.max_sectors_kb_ignore = 0; com.max_sectors_kb_align = 0; com.max_sectors_kb_num = num; } } else { log_error("ignore unknown max_sectors_kb %s", str); } } } fclose(file); } /* only used by do_client */ static char *lsf_to_str(uint32_t flags) { static char lsf_str[16]; memset(lsf_str, 0, 16); if (flags & SANLK_LSF_ADD) strcat(lsf_str, "ADD "); if (flags & SANLK_LSF_REM) strcat(lsf_str, "REM "); return lsf_str; } static const char *host_state_str(uint32_t flags) { int val = flags & SANLK_HOST_MASK; if (val == SANLK_HOST_FREE) return "FREE"; if (val == SANLK_HOST_LIVE) return "LIVE"; if (val == SANLK_HOST_FAIL) return "FAIL"; if (val == SANLK_HOST_DEAD) return "DEAD"; if (val == SANLK_HOST_UNKNOWN) return "UNKNOWN"; return "ERROR"; } static int do_client_gets(void) { struct sanlk_lockspace *lss = NULL, *ls; struct sanlk_host *hss = NULL, *hs; int ls_count = 0, hss_count = 0; int i, j, rv; rv = sanlock_get_lockspaces(&lss, &ls_count, 0); if (rv < 0) log_tool("gets error %d", rv); if (rv < 0 && rv != 
-ENOSPC) { if (lss) free(lss); return rv; } if (!lss) return 0; ls = lss; for (i = 0; i < ls_count; i++) { log_tool("s %.48s:%llu:%s:%llu %s", ls->name, (unsigned long long)ls->host_id, ls->host_id_disk.path, (unsigned long long)ls->host_id_disk.offset, !ls->flags ? "" : lsf_to_str(ls->flags)); if (!com.get_hosts) goto next; hss = NULL; hss_count = 0; rv = sanlock_get_hosts(ls->name, 0, &hss, &hss_count, 0); if (rv == -EAGAIN) { log_tool("hosts not ready"); goto next; } if (rv < 0) { log_tool("hosts error %d", rv); goto next; } if (!hss) goto next; hs = hss; for (j = 0; j < hss_count; j++) { log_tool("h %llu gen %llu timestamp %llu %s", (unsigned long long)hs->host_id, (unsigned long long)hs->generation, (unsigned long long)hs->timestamp, host_state_str(hs->flags)); hs++; } free(hss); next: ls++; } free(lss); return 0; } static int do_client_read(void) { struct sanlk_host *hss = NULL, *hs; char *res_str = NULL; uint32_t io_timeout = 0; int rv, i, hss_count = 0; if (com.lockspace.host_id_disk.path[0]) { if (com.sector_size) com.lockspace.flags |= sanlk_lsf_sector_size_to_flag(com.sector_size); if (com.align_size) com.lockspace.flags |= sanlk_lsf_align_size_to_flag(com.align_size); rv = sanlock_read_lockspace(&com.lockspace, 0, &io_timeout); } else { if (com.sector_size) com.res_args[0]->flags |= sanlk_res_sector_size_to_flag(com.sector_size); if (com.align_size) com.res_args[0]->flags |= sanlk_res_align_size_to_flag(com.align_size); if (!com.get_hosts) { rv = sanlock_read_resource(com.res_args[0], 0); } else { rv = sanlock_read_resource_owners(com.res_args[0], 0, &hss, &hss_count); } } if (rv < 0) { log_tool("read error %d", rv); goto out; } if (com.lockspace.host_id_disk.path[0]) { log_tool("s %.48s:%llu:%s:%llu", com.lockspace.name, (unsigned long long)com.lockspace.host_id, com.lockspace.host_id_disk.path, (unsigned long long)com.lockspace.host_id_disk.offset); if (com.debug) { log_tool("io_timeout %u", io_timeout); log_tool("sector_size %d", 
sanlk_lsf_sector_flag_to_size(com.lockspace.flags)); log_tool("align_size %d", sanlk_lsf_align_flag_to_size(com.lockspace.flags)); } goto out; } rv = sanlock_res_to_str(com.res_args[0], &res_str); if (rv < 0) { log_tool("res_to_str error %d", rv); goto out; } log_tool("r %s", res_str); if (com.debug) { log_tool("sector_size %d", sanlk_res_sector_flag_to_size(com.res_args[0]->flags)); log_tool("align_size %d", sanlk_res_align_flag_to_size(com.res_args[0]->flags)); } free(res_str); if (!hss) goto out; hs = hss; for (i = 0; i < hss_count; i++) { if (hs->timestamp) log_tool("h %llu gen %llu timestamp %llu", (unsigned long long)hs->host_id, (unsigned long long)hs->generation, (unsigned long long)hs->timestamp); else log_tool("h %llu gen %llu", (unsigned long long)hs->host_id, (unsigned long long)hs->generation); hs++; } out: if (hss) free(hss); return rv; } static void do_client_version(void) { uint32_t version = 0; uint32_t proto = 0; int rv; rv = sanlock_version(0, &version, &proto); if (rv < 0) { log_tool("daemon version error %d", rv); } log_tool("client version %u.%u.%u (0x%08x)", sanlock_version_major, sanlock_version_minor, sanlock_version_patch, sanlock_version_combined); log_tool("daemon version %u.%u.%u (0x%08x)", (version & 0xFF000000) >> 24, (version & 0x00FF0000) >> 16, (version & 0x0000FF00) >> 8, version); log_tool("client socket protocol %u.%u", (SM_PROTO & 0xFFFF0000) >> 16, (SM_PROTO & 0x0000FFFF)); log_tool("daemon socket protocol %u.%u", (proto & 0xFFFF0000) >> 16, (proto & 0x0000FFFF)); } static int do_client(void) { struct sanlk_host_event he; struct sanlk_resource **res_args = NULL; struct sanlk_resource *res; char *res_state = NULL; uint32_t flags = 0; uint32_t config_cmd = 0; int i, fd; int rv = 0; if (com.action == ACT_COMMAND || com.action == ACT_ACQUIRE) { for (i = 0; i < com.res_count; i++) { res = com.res_args[i]; if (com.num_hosts) { res->flags |= SANLK_RES_NUM_HOSTS; res->data32 = com.num_hosts; } if (com.persistent) res->flags |= 
SANLK_RES_PERSISTENT;
		}
	}

	/*
	 * Dispatch on the client action.  Each case stores the result of
	 * its sanlock call in rv, which do_client() returns.
	 */
	switch (com.action) {
	case ACT_STATUS:
		rv = sanlock_status(com.debug, com.sort_arg);
		break;

	case ACT_HOST_STATUS:
		rv = sanlock_host_status(com.debug, com.lockspace.name);
		break;

	case ACT_RENEWAL:
		rv = sanlock_renewal(com.lockspace.name);
		break;

	case ACT_GETS:
		rv = do_client_gets();
		break;

	case ACT_LOG_DUMP:
		rv = sanlock_log_dump(LOG_DUMP_SIZE);
		break;

	case ACT_SHUTDOWN:
		log_tool("shutdown force %d wait %d", com.force_mode, com.wait);
		rv = sanlock_shutdown(com.force_mode, com.wait);
		log_tool("shutdown done %d", rv);
		break;

	case ACT_COMMAND:
		log_tool("register");
		fd = sanlock_register();
		log_tool("register done %d", fd);

		if (fd < 0) {
			/* report the register failure instead of returning 0 */
			rv = fd;
			goto out;
		}

		flags |= com.orphan ? SANLK_ACQUIRE_ORPHAN : 0;
		log_tool("acquire fd %d", fd);
		rv = sanlock_acquire(fd, -1, flags, com.res_count, com.res_args, NULL);
		log_tool("acquire done %d", rv);

		if (rv < 0)
			goto out;

		/* no command given: just hold the leases until killed */
		if (!command[0]) {
			while (1)
				sleep(10);
		}
		execv(command, cmd_argv);
		/* execv only returns on failure; save errno before perror runs */
		rv = -errno;
		perror("execv failed");

		/* release happens automatically when pid exits and
		   daemon detects POLLHUP on registered connection */
		break;

	case ACT_ADD_LOCKSPACE:
		if (com.io_timeout_arg != DEFAULT_IO_TIMEOUT) {
			log_tool("add_lockspace_timeout %d", com.io_timeout_arg);
			rv = sanlock_add_lockspace_timeout(&com.lockspace, 0,
							   com.io_timeout_arg);
			log_tool("add_lockspace_timeout done %d", rv);
		} else {
			log_tool("add_lockspace");
			rv = sanlock_add_lockspace(&com.lockspace, 0);
			log_tool("add_lockspace done %d", rv);
		}
		break;

	case ACT_INQ_LOCKSPACE:
		log_tool("inq_lockspace");
		rv = sanlock_inq_lockspace(&com.lockspace, 0);
		log_tool("inq_lockspace done %d", rv);
		break;

	case ACT_REM_LOCKSPACE:
		log_tool("rem_lockspace");
		rv = sanlock_rem_lockspace(&com.lockspace, 0);
		log_tool("rem_lockspace done %d", rv);
		break;

	case ACT_ACQUIRE:
		log_tool("acquire pid %d", com.pid);
		flags |= com.orphan ? SANLK_ACQUIRE_ORPHAN : 0;
		rv = sanlock_acquire(-1, com.pid, flags, com.res_count, com.res_args, NULL);
		log_tool("acquire done %d", rv);
		break;

	case ACT_CONVERT:
		log_tool("convert pid %d", com.pid);
		rv = sanlock_convert(-1, com.pid, 0, com.res_args[0]);
		log_tool("convert done %d", rv);
		break;

	case ACT_RELEASE:
		log_tool("release pid %d", com.pid);
		/*
		 * Odd case to specify: release all orphan resources for the named lockspace.
		 * Uses -s lockspace_name instead of using -r, but the function takes a
		 * struct resource, so we take the lockspace arg and copy the name into
		 * a resource struct. When releasing one named orphan resource, the
		 * usual -r lockspace_name:resource_name arg is used.
		 */
		if (com.orphan && !com.res_count && com.lockspace.name[0]) {
			struct sanlk_resource *res_ls = malloc(sizeof(struct sanlk_resource));
			if (!res_ls) {
				/* nothing was released; do not report success */
				rv = -ENOMEM;
				break;
			}
			memset(res_ls, 0, sizeof(struct sanlk_resource));
			strcpy(res_ls->lockspace_name, com.lockspace.name);
			com.res_args[0] = res_ls;
			com.res_count = 1;
		}
		flags |= com.orphan ? SANLK_REL_ORPHAN : 0;
		flags |= com.all ? SANLK_REL_ALL: 0;
		rv = sanlock_release(-1, com.pid, flags, com.res_count, com.res_args);
		log_tool("release done %d", rv);
		break;

	case ACT_INQUIRE:
		log_tool("inquire pid %d", com.pid);
		rv = sanlock_inquire(-1, com.pid, 0, &com.res_count, &res_state);
		log_tool("inquire done %d res_count %d", rv, com.res_count);
		if (rv < 0)
			break;
		log_tool("\"%s\"", res_state);

		if (!com.debug)
			break;

		/* debug: round-trip the state string through args and back */
		com.res_count = 0;

		rv = sanlock_state_to_args(res_state, &com.res_count, &res_args);
		log_tool("\nstate_to_args done %d res_count %d", rv, com.res_count);
		if (rv < 0)
			break;

		free(res_state);
		res_state = NULL;

		for (i = 0; i < com.res_count; i++) {
			res = res_args[i];
			log_tool("\"%s:%s:%s:%llu:%llu\"",
				 res->lockspace_name,
				 res->name,
				 res->disks[0].path,
				 (unsigned long long)res->disks[0].offset,
				 (unsigned long long)res->lver);
		}

		rv = sanlock_args_to_state(com.res_count, res_args, &res_state);
		log_tool("\nargs_to_state done %d", rv);
		if (rv < 0)
			break;
		log_tool("\"%s\"", res_state);
		break;

	case ACT_REQUEST:
		log_tool("request");
		rv = sanlock_request(0, com.force_mode, com.res_args[0]);
		log_tool("request done %d", rv);
		break;

	case ACT_EXAMINE:
		log_tool("examine");
		if (com.lockspace.host_id_disk.path[0])
			rv = sanlock_examine(0, &com.lockspace, NULL);
		else
			rv = sanlock_examine(0, NULL, com.res_args[0]);
		log_tool("examine done %d", rv);
		break;

	case ACT_CLIENT_ALIGN:
		log_tool("align");
		rv = sanlock_align(&com.lockspace.host_id_disk);
		log_tool("align done %d", rv);
		break;

	case ACT_CLIENT_INIT:
		log_tool("init");
		if (com.lockspace.host_id_disk.path[0]) {
			if (com.sector_size)
				com.lockspace.flags |= sanlk_lsf_sector_size_to_flag(com.sector_size);
			if (com.align_size)
				com.lockspace.flags |= sanlk_lsf_align_size_to_flag(com.align_size);

			rv = sanlock_write_lockspace(&com.lockspace,
						     com.max_hosts, 0,
						     com.io_timeout_arg);
		} else {
			if (com.sector_size)
				com.res_args[0]->flags |= sanlk_res_sector_size_to_flag(com.sector_size);
			if (com.align_size)
				com.res_args[0]->flags |= sanlk_res_align_size_to_flag(com.align_size);

			rv = sanlock_write_resource(com.res_args[0],
						    com.max_hosts,
						    com.num_hosts,
						    com.clear_arg ? SANLK_WRITE_CLEAR : 0);
		}
		log_tool("init done %d", rv);
		break;

	case ACT_CLIENT_READ:
		rv = do_client_read();
		break;

	case ACT_VERSION:
		do_client_version();
		break;

	case ACT_SET_EVENT:
		log_tool("set_event %llu %llu event 0x%llx data 0x%llx",
			 (unsigned long long)com.host_id,
			 (unsigned long long)com.host_generation,
			 (unsigned long long)com.he_event,
			 (unsigned long long)com.he_data);

		he.host_id = com.host_id;
		he.generation = com.host_generation;
		he.event = com.he_event;
		he.data = com.he_data;

		rv = sanlock_set_event(com.lockspace.name, &he, 0);
		log_tool("set_event done %d", rv);
		break;

	case ACT_SET_CONFIG:
		/* -O takes precedence over -u when both were given */
		if (com.orphan_set)
			config_cmd = com.orphan ? SANLK_CONFIG_USED_BY_ORPHANS : SANLK_CONFIG_UNUSED_BY_ORPHANS;
		else if (com.used_set)
			config_cmd = com.used ? SANLK_CONFIG_USED : SANLK_CONFIG_UNUSED;

		log_tool("set_config %.48s %u", com.lockspace.name, config_cmd);
		rv = sanlock_set_config(com.lockspace.name, 0, config_cmd, NULL);
		log_tool("set_config done %d", rv);
		break;

	case ACT_FORMAT:
		if (com.sector_size)
			com.rindex.flags |= sanlk_rif_sector_size_to_flag(com.sector_size);
		if (com.align_size)
			com.rindex.flags |= sanlk_rif_align_size_to_flag(com.align_size);

		rv = sanlock_format_rindex(&com.rindex, 0);
		log_tool("format done %d", rv);
		break;

	case ACT_REBUILD:
		rv = sanlock_rebuild_rindex(&com.rindex, 0);
		log_tool("rebuild done %d", rv);
		break;

	case ACT_CREATE:
		rv = sanlock_create_resource(&com.rindex, 0, &com.rentry, 0, 0);
		log_tool("create_resource done %d", rv);
		if (!rv)
			log_tool("offset %llu", (unsigned long long)com.rentry.offset);
		break;

	case ACT_DELETE:
		rv = sanlock_delete_resource(&com.rindex, 0, &com.rentry);
		log_tool("delete_resource done %d", rv);
		break;

	case ACT_LOOKUP:
		rv = sanlock_lookup_rindex(&com.rindex, 0, &com.rentry);
		log_tool("lookup done %d", rv);
		if (!rv)
			log_tool("name %.48s offset %llu",
				 com.rentry.name[0] ?
com.rentry.name : "-", (unsigned long long)com.rentry.offset); break; case ACT_UPDATE: rv = sanlock_update_rindex(&com.rindex, com.clear_arg ? SANLK_RXUP_REM : SANLK_RXUP_ADD, &com.rentry); log_tool("update done %d", rv); break; default: log_tool("action not implemented"); rv = -1; } out: return rv; } #define MAX_LINE 128 static int read_file_leader(struct leader_record *leader, int is_ls) { FILE *file; char line[MAX_LINE]; char field[MAX_LINE]; char val[MAX_LINE]; uint32_t checksum = 0; uint32_t new_checksum; struct leader_record lr; int rv; file = fopen(com.file_path, "r"); if (!file) { log_tool("open error %d %s", errno, com.file_path); return -1; } memcpy(&lr, leader, sizeof(lr)); memset(line, 0, sizeof(line)); while (fgets(line, MAX_LINE, file)) { memset(field, 0, sizeof(field)); memset(val, 0, sizeof(val)); rv = sscanf(line, "%s %s", field, val); if (rv != 2) { log_tool("ignore line: \"%s\"", line); continue; } if (!strcmp(field, "magic")) { sscanf(val, "0x%x", &lr.magic); } else if (!strcmp(field, "version")) { sscanf(val, "0x%x", &lr.version); } else if (!strcmp(field, "flags")) { sscanf(val, "0x%x", &lr.flags); } else if (!strcmp(field, "sector_size")) { sscanf(val, "%u", &lr.sector_size); } else if (!strcmp(field, "num_hosts")) { sscanf(val, "%llu", (unsigned long long *)&lr.num_hosts); } else if (!strcmp(field, "max_hosts")) { sscanf(val, "%llu", (unsigned long long *)&lr.max_hosts); } else if (!strcmp(field, "owner_id")) { sscanf(val, "%llu", (unsigned long long *)&lr.owner_id); } else if (!strcmp(field, "owner_generation")) { sscanf(val, "%llu", (unsigned long long *)&lr.owner_generation); } else if (!strcmp(field, "lver")) { sscanf(val, "%llu", (unsigned long long *)&lr.lver); } else if (!strcmp(field, "space_name")) { strncpy(lr.space_name, val, NAME_ID_SIZE); } else if (!strcmp(field, "resource_name")) { strncpy(lr.resource_name, val, NAME_ID_SIZE); } else if (!strcmp(field, "timestamp")) { sscanf(val, "%llu", (unsigned long long *)&lr.timestamp); } 
else if (!strcmp(field, "checksum")) { sscanf(val, "0x%x", &checksum); } else if (!strcmp(field, "io_timeout")) { sscanf(val, "%hu", &lr.io_timeout); } else if (is_ls && !strcmp(field, "extra1")) { sscanf(val, "%llu", (unsigned long long *)&lr.write_id); } else if (is_ls && !strcmp(field, "extra2")) { sscanf(val, "%llu", (unsigned long long *)&lr.write_generation); } else if (is_ls && !strcmp(field, "extra3")) { sscanf(val, "%llu", (unsigned long long *)&lr.write_timestamp); } else if (!is_ls && !strcmp(field, "write_id")) { sscanf(val, "%llu", (unsigned long long *)&lr.write_id); } else if (!is_ls && !strcmp(field, "write_generation")) { sscanf(val, "%llu", (unsigned long long *)&lr.write_generation); } else if (!is_ls && !strcmp(field, "write_timestamp")) { sscanf(val, "%llu", (unsigned long long *)&lr.write_timestamp); } else { log_tool("ignore field: \"%s\"", field); } memset(line, 0, sizeof(line)); } fclose(file); new_checksum = leader_checksum(&lr); if (!com.force_mode) { lr.checksum = new_checksum; log_tool("use new generated checksum %x", new_checksum); } else { lr.checksum = checksum; log_tool("warning: using specified checksum %x (generated is %x)", checksum, new_checksum); } memcpy(leader, &lr, sizeof(lr)); return 0; } static void print_leader(struct leader_record *leader, int is_ls) { log_tool("magic 0x%0x", leader->magic); log_tool("version 0x%x", leader->version); log_tool("flags 0x%x", leader->flags); log_tool("sector_size %u", leader->sector_size); log_tool("num_hosts %llu", (unsigned long long)leader->num_hosts); log_tool("max_hosts %llu", (unsigned long long)leader->max_hosts); log_tool("owner_id %llu", (unsigned long long)leader->owner_id); log_tool("owner_generation %llu", (unsigned long long)leader->owner_generation); log_tool("lver %llu", (unsigned long long)leader->lver); log_tool("space_name %.48s", leader->space_name); log_tool("resource_name %.48s", leader->resource_name); log_tool("timestamp %llu", (unsigned long long)leader->timestamp); 
log_tool("checksum 0x%0x", leader->checksum); log_tool("io_timeout %u", leader->io_timeout); if (!is_ls) { log_tool("write_id %llu", (unsigned long long)leader->write_id); log_tool("write_generation %llu", (unsigned long long)leader->write_generation); log_tool("write_timestamp %llu", (unsigned long long)leader->write_timestamp); } else { log_tool("extra1 %llu", (unsigned long long)leader->write_id); log_tool("extra2 %llu", (unsigned long long)leader->write_generation); log_tool("extra3 %llu", (unsigned long long)leader->write_timestamp); } } static int do_direct_read_leader(void) { struct leader_record leader; int rv; rv = direct_read_leader(&main_task, com.io_timeout_arg, &com.lockspace, com.res_args[0], &leader); log_tool("read_leader done %d", rv); print_leader(&leader, com.res_args[0] ? 0 : 1); return rv; } /* * read the current leader record from disk, override any values found in * the file, write back the result. */ static int do_direct_write_leader(void) { struct leader_record leader; char *res_str = NULL; int is_ls = com.res_args[0] ? 
0 : 1; int rv; memset(&leader, 0, sizeof(leader)); direct_read_leader(&main_task, com.io_timeout_arg, &com.lockspace, com.res_args[0], &leader); rv = read_file_leader(&leader, is_ls); if (rv < 0) return rv; /* make a record in the logs that this has been done */ if (is_ls) { syslog(LOG_WARNING, "write_leader lockspace %.48s:%llu:%s:%llu", com.lockspace.name, (unsigned long long)com.lockspace.host_id, com.lockspace.host_id_disk.path, (unsigned long long)com.lockspace.host_id_disk.offset); } else { rv = sanlock_res_to_str(com.res_args[0], &res_str); if (rv < 0) goto out; syslog(LOG_WARNING, "write_leader resource %s", res_str); } rv = direct_write_leader(&main_task, com.io_timeout_arg, &com.lockspace, com.res_args[0], &leader); out: log_tool("write_leader done %d", rv); if (!rv) print_leader(&leader, is_ls); if (res_str) free(res_str); return rv; } static int do_direct_init(void) { char *res_str = NULL; int rv = -EINVAL; if (com.lockspace.host_id_disk.path[0]) { if (com.sector_size) com.lockspace.flags |= sanlk_lsf_sector_size_to_flag(com.sector_size); if (com.align_size) com.lockspace.flags |= sanlk_lsf_align_size_to_flag(com.align_size); syslog(LOG_WARNING, "init lockspace %.48s:%llu:%s:%llu 0x%x", com.lockspace.name, (unsigned long long)com.lockspace.host_id, com.lockspace.host_id_disk.path, (unsigned long long)com.lockspace.host_id_disk.offset, com.lockspace.flags); rv = direct_write_lockspace(&main_task, &com.lockspace, com.io_timeout_arg); } else if (com.res_args[0]) { if (com.sector_size) com.res_args[0]->flags |= sanlk_res_sector_size_to_flag(com.sector_size); if (com.align_size) com.res_args[0]->flags |= sanlk_res_align_size_to_flag(com.align_size); rv = sanlock_res_to_str(com.res_args[0], &res_str); if (rv < 0) { log_tool("res_to_str parsing error %d", rv); goto out; } else { syslog(LOG_WARNING, "init resource %s", res_str); } rv = direct_write_resource(&main_task, com.res_args[0], com.num_hosts, com.clear_arg); } out: log_tool("init done %d", rv); if 
(res_str) free(res_str); return rv; } static int do_direct(void) { struct leader_record leader; uint32_t cmd_flags = 0; int rv; /* * We want a record of any out-of-band changes to disk in the system * log. If /dev/log is missing, drop the message so it would not be * logged to stderr. */ openlog("sanlock-direct", LOG_PID, LOG_DAEMON); setup_task_aio(&main_task, com.aio_arg, DIRECT_AIO_CB_SIZE); sprintf(main_task.name, "%s", "main_direct"); switch (com.action) { case ACT_DIRECT_INIT: rv = do_direct_init(); break; case ACT_DUMP: rv = direct_dump(&main_task, com.dump_path, com.force_mode); break; case ACT_NEXT_FREE: rv = direct_next_free(&main_task, com.dump_path); break; case ACT_READ_LEADER: rv = do_direct_read_leader(); break; case ACT_WRITE_LEADER: rv = do_direct_write_leader(); break; case ACT_FORMAT: if (com.sector_size) com.rindex.flags |= sanlk_rif_sector_size_to_flag(com.sector_size); if (com.align_size) com.rindex.flags |= sanlk_rif_align_size_to_flag(com.align_size); syslog(LOG_WARNING, "format rindex %.48s:%s:%llu 0x%x", com.rindex.lockspace_name, com.rindex.disk.path, (unsigned long long)com.rindex.disk.offset, com.rindex.flags); rv = direct_rindex_format(&main_task, &com.rindex); log_tool("format done %d", rv); break; case ACT_REBUILD: rv = direct_rindex_rebuild(&main_task, &com.rindex, 0); log_tool("rebuild done %d", rv); break; case ACT_LOOKUP: rv = direct_rindex_lookup(&main_task, &com.rindex, &com.rentry, 0); log_tool("lookup done %d", rv); if (!rv) log_tool("name %.48s offset %llu", com.rentry.name[0] ? 
com.rentry.name : "-", (unsigned long long)com.rentry.offset); break; case ACT_UPDATE: if (com.clear_arg) cmd_flags |= SANLK_RXUP_REM; else cmd_flags |= SANLK_RXUP_ADD; rv = direct_rindex_update(&main_task, &com.rindex, &com.rentry, cmd_flags); log_tool("update done %d", rv); break; case ACT_ACQUIRE: syslog(LOG_WARNING, "acquire"); rv = direct_acquire(&main_task, com.io_timeout_arg, com.res_args[0], com.num_hosts, com.host_id, com.host_generation, &leader); log_tool("acquire done %d", rv); break; case ACT_RELEASE: syslog(LOG_WARNING, "release"); rv = direct_release(&main_task, com.io_timeout_arg, com.res_args[0], &leader); log_tool("release done %d", rv); break; case ACT_ACQUIRE_ID: syslog(LOG_WARNING, "acquire_id"); setup_host_name(); rv = direct_acquire_id(&main_task, com.io_timeout_arg, &com.lockspace, our_host_name_global); log_tool("acquire_id done %d", rv); break; case ACT_RELEASE_ID: syslog(LOG_WARNING, "release_id"); rv = direct_release_id(&main_task, com.io_timeout_arg, &com.lockspace); log_tool("release_id done %d", rv); break; case ACT_RENEW_ID: syslog(LOG_WARNING, "renew_id"); rv = direct_renew_id(&main_task, com.io_timeout_arg, &com.lockspace); log_tool("rewew_id done %d", rv); break; default: log_tool("direct action %d not known", com.action); rv = -1; } close_task_aio(&main_task); closelog(); return rv; } static void set_sanlock_version(void) { char version_str[64]; char *major_str, *minor_str, *patch_str; char *d1, *d2; strncpy(version_str, VERSION, 64); d1 = strstr(version_str, "."); if (!d1) return; d2 = strstr(d1 + 1, "."); if (!d2) return; major_str = version_str; minor_str = d1 + 1; patch_str = d2 + 1; *d1 = '\0'; *d2 = '\0'; sanlock_version_major = atoi(major_str); sanlock_version_minor = atoi(minor_str); sanlock_version_patch = atoi(patch_str); sanlock_version_build = 0; /* TODO */ sanlock_version_combined = 0; sanlock_version_combined |= sanlock_version_major << 24; sanlock_version_combined |= sanlock_version_minor << 16; 
sanlock_version_combined |= sanlock_version_patch << 8; sanlock_version_combined |= sanlock_version_build; } int main(int argc, char *argv[]) { int rv; BUILD_BUG_ON(sizeof(struct sanlk_disk) != sizeof(struct sync_disk)); BUILD_BUG_ON(sizeof(struct leader_record) > LEADER_RECORD_MAX); BUILD_BUG_ON(sizeof(struct helper_msg) != SANLK_HELPER_MSG_LEN); /* initialize global EXTERN variables */ set_sanlock_version(); kill_count_max = 100; kill_grace_seconds = DEFAULT_GRACE_SEC; helper_ci = -1; helper_pid = -1; helper_kill_fd = -1; helper_status_fd = -1; pthread_mutex_init(&spaces_mutex, NULL); INIT_LIST_HEAD(&spaces); INIT_LIST_HEAD(&spaces_rem); INIT_LIST_HEAD(&spaces_add); memset(&com, 0, sizeof(com)); com.use_watchdog = DEFAULT_USE_WATCHDOG; com.high_priority = DEFAULT_HIGH_PRIORITY; com.mlock_level = DEFAULT_MLOCK_LEVEL; com.names_log_priority = LOG_WARNING; com.max_worker_threads = DEFAULT_MAX_WORKER_THREADS; com.io_timeout_arg = DEFAULT_IO_TIMEOUT; com.write_init_io_timeout = DEFAULT_WRITE_INIT_IO_TIMEOUT; com.aio_arg = DEFAULT_USE_AIO; com.pid = -1; com.sh_retries = DEFAULT_SH_RETRIES; com.quiet_fail = DEFAULT_QUIET_FAIL; com.renewal_read_extend_sec_set = 0; com.renewal_read_extend_sec = 0; com.renewal_history_size = DEFAULT_RENEWAL_HISTORY_SIZE; com.paxos_debug_all = 0; com.max_sectors_kb_ignore = DEFAULT_MAX_SECTORS_KB_IGNORE; com.max_sectors_kb_align = DEFAULT_MAX_SECTORS_KB_ALIGN; com.max_sectors_kb_num = DEFAULT_MAX_SECTORS_KB_NUM; com.debug_cmds = ~0LL; /* By default disable cmds that often cause too much logging. 
*/ clear_cmd_debug(SM_CMD_INQ_LOCKSPACE); clear_cmd_debug(SM_CMD_GET_LOCKSPACES); clear_cmd_debug(SM_CMD_GET_HOSTS); clear_cmd_debug(SM_CMD_READ_LOCKSPACE); clear_cmd_debug(SM_CMD_READ_RESOURCE); clear_cmd_debug(SM_CMD_READ_RESOURCE_OWNERS); clear_cmd_debug(SM_CMD_WRITE_RESOURCE); if (getgrnam("sanlock") && getpwnam("sanlock")) { com.uname = (char *)"sanlock"; com.gname = (char *)"sanlock"; com.uid = user_to_uid(com.uname); com.gid = group_to_gid(com.uname); } else { com.uname = NULL; com.gname = NULL; com.uid = DEFAULT_SOCKET_UID; com.gid = DEFAULT_SOCKET_GID; } memset(&main_task, 0, sizeof(main_task)); /* * read_config_file() overrides com default settings, * read_command_line() overrides com default settings and * config file settings. */ read_config_file(); rv = read_command_line(argc, argv); if (rv < 0) goto out; switch (com.type) { case COM_DAEMON: rv = do_daemon(); break; case COM_CLIENT: rv = do_client(); break; case COM_DIRECT: rv = do_direct(); break; }; out: return rv == 0 ? EXIT_SUCCESS : EXIT_FAILURE; } sanlock-3.8.2/src/mode_block.h000066400000000000000000000010221371427612200162140ustar00rootroot00000000000000/* * Copyright 2012 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. */ #ifndef __MODE_BLOCK_H__ #define __MODE_BLOCK_H__ #define MBLOCK_OFFSET 128 /* include paxos_dblock plus padding */ #define MBLOCK_SHARED 0x00000001 struct mode_block { uint32_t flags; uint64_t generation; }; #endif sanlock-3.8.2/src/monotime.c000066400000000000000000000014541371427612200157510ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. 
* * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include "monotime.h" uint64_t monotime(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return ts.tv_sec; } void ts_diff(struct timespec *begin, struct timespec *end, struct timespec *diff) { if ((end->tv_nsec - begin->tv_nsec) < 0) { diff->tv_sec = end->tv_sec - begin->tv_sec - 1; diff->tv_nsec = end->tv_nsec - begin->tv_nsec + 1000000000; } else { diff->tv_sec = end->tv_sec - begin->tv_sec; diff->tv_nsec = end->tv_nsec - begin->tv_nsec; } } sanlock-3.8.2/src/monotime.h000066400000000000000000000006561371427612200157610ustar00rootroot00000000000000/* * Copyright 2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __MONOTIME_H__ #define __MONOTIME_H__ uint64_t monotime(void); void ts_diff(struct timespec *begin, struct timespec *end, struct timespec *diff); #endif sanlock-3.8.2/src/ondisk.c000066400000000000000000000140531371427612200154100ustar00rootroot00000000000000/* * Copyright 2014 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. */ #include #include #include #include #include #include "sanlock_internal.h" #include "ondisk.h" /* * "end" variables point to ondisk format (endian converted) structures. 
*/

/*
 * Read a little-endian 32-bit magic number from the start of an ondisk
 * buffer and store it in host byte order.  memcpy is used to get the
 * bytes out of the char buffer.
 */
void magic_in(char *end, uint32_t *magic)
{
	uint32_t magic_end;

	memcpy(&magic_end, end, sizeof(uint32_t));
	*magic = le32_to_cpu(magic_end);
}

/*
 * Convert a leader record from ondisk (little endian) format in "end"
 * to host format in "lr".  The name fields are byte arrays and are
 * copied unchanged.
 */
void leader_record_in(struct leader_record *end, struct leader_record *lr)
{
	lr->magic = le32_to_cpu(end->magic);
	lr->version = le32_to_cpu(end->version);
	lr->flags = le32_to_cpu(end->flags);
	lr->sector_size = le32_to_cpu(end->sector_size);
	lr->num_hosts = le64_to_cpu(end->num_hosts);
	lr->max_hosts = le64_to_cpu(end->max_hosts);
	lr->owner_id = le64_to_cpu(end->owner_id);
	lr->owner_generation = le64_to_cpu(end->owner_generation);
	lr->lver = le64_to_cpu(end->lver);
	memcpy(lr->space_name, end->space_name, NAME_ID_SIZE);
	memcpy(lr->resource_name, end->resource_name, NAME_ID_SIZE);
	lr->timestamp = le64_to_cpu(end->timestamp);
	lr->unused1 = le64_to_cpu(end->unused1);
	lr->checksum = le32_to_cpu(end->checksum);
	lr->unused2 = le16_to_cpu(end->unused2);
	lr->io_timeout = le16_to_cpu(end->io_timeout);
	lr->write_id = le64_to_cpu(end->write_id);
	lr->write_generation = le64_to_cpu(end->write_generation);
	lr->write_timestamp = le64_to_cpu(end->write_timestamp);
}

/*
 * Convert a leader record from host format in "lr" to ondisk (little
 * endian) format in "end".  The checksum field of "end" is deliberately
 * not written here; see the note inside the function.
 */
void leader_record_out(struct leader_record *lr, struct leader_record *end)
{
	end->magic = cpu_to_le32(lr->magic);
	end->version = cpu_to_le32(lr->version);
	end->flags = cpu_to_le32(lr->flags);
	end->sector_size = cpu_to_le32(lr->sector_size);
	end->num_hosts = cpu_to_le64(lr->num_hosts);
	end->max_hosts = cpu_to_le64(lr->max_hosts);
	end->owner_id = cpu_to_le64(lr->owner_id);
	end->owner_generation = cpu_to_le64(lr->owner_generation);
	end->lver = cpu_to_le64(lr->lver);
	memcpy(end->space_name, lr->space_name, NAME_ID_SIZE);
	memcpy(end->resource_name, lr->resource_name, NAME_ID_SIZE);
	end->timestamp = cpu_to_le64(lr->timestamp);
	end->unused1 = cpu_to_le64(lr->unused1);

	/* N.B. the checksum must be computed after the byte swapping */
	/* leader_record_out(lr, end); checksum = compute(end); end->checksum = cpu_to_le32(checksum); */

	end->unused2 = cpu_to_le16(lr->unused2);
	end->io_timeout = cpu_to_le16(lr->io_timeout);
	end->write_id = cpu_to_le64(lr->write_id);
	end->write_generation = cpu_to_le64(lr->write_generation);
	end->write_timestamp = cpu_to_le64(lr->write_timestamp);
}

/* Convert a request record from ondisk format in "end" to host format in "rr". */
void request_record_in(struct request_record *end, struct request_record *rr)
{
	rr->magic = le32_to_cpu(end->magic);
	rr->version = le32_to_cpu(end->version);
	rr->lver = le64_to_cpu(end->lver);
	rr->force_mode = le32_to_cpu(end->force_mode);
}

/* Convert a request record from host format in "rr" to ondisk format in "end". */
void request_record_out(struct request_record *rr, struct request_record *end)
{
	end->magic = cpu_to_le32(rr->magic);
	end->version = cpu_to_le32(rr->version);
	end->lver = cpu_to_le64(rr->lver);
	end->force_mode = cpu_to_le32(rr->force_mode);
}

/* Convert a paxos dblock from ondisk format in "end" to host format in "pd". */
void paxos_dblock_in(struct paxos_dblock *end, struct paxos_dblock *pd)
{
	pd->mbal = le64_to_cpu(end->mbal);
	pd->bal = le64_to_cpu(end->bal);
	pd->inp = le64_to_cpu(end->inp);
	pd->inp2 = le64_to_cpu(end->inp2);
	pd->inp3 = le64_to_cpu(end->inp3);
	pd->lver = le64_to_cpu(end->lver);
	pd->checksum = le32_to_cpu(end->checksum);
	pd->flags = le32_to_cpu(end->flags);
}

/*
 * Convert a paxos dblock from host format in "pd" to ondisk format in
 * "end".  The checksum field of "end" is deliberately not written here;
 * see the note inside the function.
 */
void paxos_dblock_out(struct paxos_dblock *pd, struct paxos_dblock *end)
{
	end->mbal = cpu_to_le64(pd->mbal);
	end->bal = cpu_to_le64(pd->bal);
	end->inp = cpu_to_le64(pd->inp);
	end->inp2 = cpu_to_le64(pd->inp2);
	end->inp3 = cpu_to_le64(pd->inp3);
	end->lver = cpu_to_le64(pd->lver);

	/* N.B. the checksum must be computed after the byte swapping */
	/* paxos_dblock_out(pd, end); checksum = compute(end), end->checksum = cpu_to_le32(checksum); */

	end->flags = cpu_to_le32(pd->flags);
}

/* Convert a mode block from ondisk format in "end" to host format in "mb". */
void mode_block_in(struct mode_block *end, struct mode_block *mb)
{
	mb->flags = le32_to_cpu(end->flags);
	mb->generation = le64_to_cpu(end->generation);
}

/* Convert a mode block from host format in "mb" to ondisk format in "end". */
void mode_block_out(struct mode_block *mb, struct mode_block *end)
{
	end->flags = cpu_to_le32(mb->flags);
	end->generation = cpu_to_le64(mb->generation);
}

/*
 * Convert an rindex header from ondisk format in "end" to host format in
 * "rh".  The lockspace name is a byte array and is copied unchanged.
 */
void rindex_header_in(struct rindex_header *end, struct rindex_header *rh)
{
	rh->magic = le32_to_cpu(end->magic);
	rh->version = le32_to_cpu(end->version);
	rh->flags = le32_to_cpu(end->flags);
	rh->sector_size = le32_to_cpu(end->sector_size);
	rh->max_resources = le32_to_cpu(end->max_resources);
	rh->unused = le32_to_cpu(end->unused);
	rh->rx_offset = le64_to_cpu(end->rx_offset);
	memcpy(rh->lockspace_name, end->lockspace_name, NAME_ID_SIZE);
}

/* Convert an rindex header from host format in "rh" to ondisk format in "end". */
void rindex_header_out(struct rindex_header *rh, struct rindex_header *end)
{
	end->magic = cpu_to_le32(rh->magic);
	end->version = cpu_to_le32(rh->version);
	end->flags = cpu_to_le32(rh->flags);
	end->sector_size = cpu_to_le32(rh->sector_size);
	end->max_resources = cpu_to_le32(rh->max_resources);
	end->unused = cpu_to_le32(rh->unused);
	end->rx_offset = cpu_to_le64(rh->rx_offset);
	memcpy(end->lockspace_name, rh->lockspace_name, NAME_ID_SIZE);
}

/*
 * Convert an rindex entry from ondisk format in "end" to host format in
 * "re".  The name is a byte array and is copied unchanged.
 */
void rindex_entry_in(struct rindex_entry *end, struct rindex_entry *re)
{
	re->res_offset = le64_to_cpu(end->res_offset);
	re->flags = le32_to_cpu(end->flags);
	re->unused = le32_to_cpu(end->unused);
	memcpy(re->name, end->name, NAME_ID_SIZE);
}

/* Convert an rindex entry from host format in "re" to ondisk format in "end". */
void rindex_entry_out(struct rindex_entry *re, struct rindex_entry *end)
{
	end->res_offset = cpu_to_le64(re->res_offset);
	end->flags = cpu_to_le32(re->flags);
	end->unused = cpu_to_le32(re->unused);
	memcpy(end->name, re->name, NAME_ID_SIZE);
}
sanlock-3.8.2/src/ondisk.h000066400000000000000000000035131371427612200154140ustar00rootroot00000000000000/*
 * Copyright 2014
Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 */

#ifndef __ONDISK_H__
#define __ONDISK_H__

/* NOTE(review): the two header names below are missing in this copy of the file */
#include
#include

/*
 * sanlock ondisk format is little endian.
 *
 * On a big endian host the conversion macros byte swap their argument;
 * on a little endian host they evaluate to the argument unchanged.
 */

#if __BYTE_ORDER == __BIG_ENDIAN
#define le16_to_cpu(x) (bswap_16((x)))
#define le32_to_cpu(x) (bswap_32((x)))
#define le64_to_cpu(x) (bswap_64((x)))
#define cpu_to_le16(x) (bswap_16((x)))
#define cpu_to_le32(x) (bswap_32((x)))
#define cpu_to_le64(x) (bswap_64((x)))
#endif

#if __BYTE_ORDER == __LITTLE_ENDIAN
#define le16_to_cpu(x) (x)
#define le32_to_cpu(x) (x)
#define le64_to_cpu(x) (x)
#define cpu_to_le16(x) (x)
#define cpu_to_le32(x) (x)
#define cpu_to_le64(x) (x)
#endif

/*
 * In the prototypes below, "end" is the ondisk (endian converted) form
 * of a structure; the other struct argument is the host form.  The *_in
 * functions take "end" first and the *_out functions take it last.
 */
void magic_in(char *end, uint32_t *magic);
void leader_record_in(struct leader_record *end, struct leader_record *lr);
void leader_record_out(struct leader_record *lr, struct leader_record *end);
void request_record_in(struct request_record *end, struct request_record *rr);
void request_record_out(struct request_record *rr, struct request_record *end);
void paxos_dblock_in(struct paxos_dblock *end, struct paxos_dblock *pd);
void paxos_dblock_out(struct paxos_dblock *pd, struct paxos_dblock *end);
void mode_block_in(struct mode_block *end, struct mode_block *mb);
void mode_block_out(struct mode_block *mb, struct mode_block *end);
void rindex_header_in(struct rindex_header *end, struct rindex_header *rh);
void rindex_header_out(struct rindex_header *rh, struct rindex_header *end);
void rindex_entry_in(struct rindex_entry *end, struct rindex_entry *re);
void rindex_entry_out(struct rindex_entry *re, struct rindex_entry *end);

#endif
sanlock-3.8.2/src/paxos_dblock.h000066400000000000000000000020131371427612200165670ustar00rootroot00000000000000/*
 * Copyright 2014 Red Hat, Inc.
*
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 */

#ifndef __PAXOS_DBLOCK_H__
#define __PAXOS_DBLOCK_H__

/* The first dblock (for host_id 1) is in the third sector of a paxos lease.
   The first sector holds the leader record, and the second sector holds the
   request record. */

/* six uint64_t fields (mbal .. lver) precede the checksum: 6 * 8 = 48 bytes */
#define DBLOCK_CHECKSUM_LEN 48 /* ends before checksum field */

#define DBLOCK_FL_RELEASED 0x00000001

/*
 * Per-host paxos disk block.  The checksum covers the first
 * DBLOCK_CHECKSUM_LEN bytes, i.e. every field before "checksum".
 */
struct paxos_dblock {
	uint64_t mbal;
	uint64_t bal;
	uint64_t inp; /* host_id */
	uint64_t inp2; /* host_id generation */
	uint64_t inp3; /* host_id's timestamp */
	uint64_t lver;
	uint32_t checksum;
	uint32_t flags; /* DBLOCK_FL_ */
};

/*
 * This struct cannot grow any larger than MBLOCK_OFFSET (128)
 * because the mode_block starts at that offset in the same sector.
 */

#endif
sanlock-3.8.2/src/paxos_lease.c000066400000000000000000002175441371427612200164360ustar00rootroot00000000000000/*
 * Copyright 2010-2011 Red Hat, Inc.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v2 or (at your option) any later version.
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "diskio.h" #include "ondisk.h" #include "direct.h" #include "log.h" #include "lockspace.h" #include "delta_lease.h" #include "paxos_lease.h" #include "resource.h" #include "timeouts.h" uint32_t crc32c(uint32_t crc, uint8_t *data, size_t length); int get_rand(int a, int b); /* * BK_DEBUG_SIZE: size of buffer to hold ballot debug info, * this can't be larger than LOG_STR_LEN 512 * BK_STR_SIZE: the max length of a dblock string for one host * BK_DEBUG_COUNT: the max number of hosts for which we'll copy * dblock info * * BK_DEBUG_COUNT * BK_STR_SIZE + extra debug text that comes before * the dblock info needs to be less than BK_DEBUG_SIZE. * Be very careful about increasing BK_DEBUG_COUNT because the use * of strncat depends on it. */ #define BK_DEBUG_SIZE 512 #define BK_DEBUG_COUNT 4 #define BK_STR_SIZE 80 static uint32_t roundup_power_of_two(uint32_t val) { val--; val |= val >> 1; val |= val >> 2; val |= val >> 4; val |= val >> 8; val |= val >> 16; val++; return val; } uint32_t leader_checksum(struct leader_record *lr) { return crc32c((uint32_t)~1, (uint8_t *)lr, LEADER_CHECKSUM_LEN); } uint32_t dblock_checksum(struct paxos_dblock *pd) { return crc32c((uint32_t)~1, (uint8_t *)pd, DBLOCK_CHECKSUM_LEN); } int paxos_lease_request_read(struct task *task, struct token *token, struct request_record *rr) { struct request_record rr_end; int rv; /* 1 = request record is second sector */ rv = read_sectors(&token->disks[0], token->sector_size, 1, 1, (char *)&rr_end, sizeof(struct request_record), task, token->io_timeout, "request"); if (rv < 0) return rv; request_record_in(&rr_end, rr); return SANLK_OK; } int paxos_lease_request_write(struct task *task, struct token *token, struct request_record *rr) { struct request_record rr_end; int rv; request_record_out(rr, &rr_end); rv = write_sector(&token->disks[0], 
token->sector_size, 1, (char *)&rr_end, sizeof(struct request_record), task, token->io_timeout, "request"); if (rv < 0) return rv; return SANLK_OK; } static int write_dblock(struct task *task, struct token *token, struct sync_disk *disk, uint64_t host_id, struct paxos_dblock *pd); int paxos_erase_dblock(struct task *task, struct token *token, uint64_t host_id) { struct paxos_dblock dblock_end; int num_disks = token->r.num_disks; int num_writes = 0; int d, rv, error = -1; memset(&dblock_end, 0, sizeof(struct paxos_dblock)); for (d = 0; d < num_disks; d++) { rv = write_dblock(task, token, &token->disks[d], host_id, &dblock_end); if (rv < 0) { error = rv; continue; } num_writes++; } if (!majority_disks(num_disks, num_writes)) return error; return SANLK_OK; } /* * Write a combined dblock and mblock. This is an odd case that doesn't fit * well with the way the code has been written. It's used when we want to * convert sh to ex, which requires acquiring the lease owner, but we don't * want to clobber our SHARED mblock by writing a plain dblock in the process * in case there's a problem with the acquiring, we don't want to loose our * shared mode lease. * * NB. this assumes the only mblock flag we want is MBLOCK_SHARED and that * the generation we want is token->host_generation. This is currently * the case, but could change in the future. 
*/ static int write_dblock_mblock_sh(struct task *task, struct token *token, struct sync_disk *disk, uint64_t host_id, struct paxos_dblock *pd) { struct paxos_dblock pd_end; struct mode_block mb; struct mode_block mb_end; char *iobuf, **p_iobuf; uint64_t offset; uint32_t checksum; int iobuf_len, rv, sector_size; memset(&mb, 0, sizeof(mb)); mb.flags = MBLOCK_SHARED; mb.generation = token->host_generation; sector_size = token->sector_size; iobuf_len = sector_size; if (!iobuf_len) return -EINVAL; p_iobuf = &iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) return -ENOMEM; offset = disk->offset + ((2 + host_id - 1) * sector_size); paxos_dblock_out(pd, &pd_end); /* * N.B. must compute checksum after the data has been byte swapped. */ checksum = dblock_checksum(&pd_end); pd->checksum = checksum; pd_end.checksum = cpu_to_le32(checksum); mode_block_out(&mb, &mb_end); memcpy(iobuf, (char *)&pd_end, sizeof(struct paxos_dblock)); memcpy(iobuf + MBLOCK_OFFSET, (char *)&mb_end, sizeof(struct mode_block)); rv = write_iobuf(disk->fd, offset, iobuf, iobuf_len, task, token->io_timeout, NULL); if (rv < 0) { log_errot(token, "write_dblock_mblock_sh host_id %llu gen %llu rv %d", (unsigned long long)host_id, (unsigned long long)token->host_generation, rv); } if (rv != SANLK_AIO_TIMEOUT) free(iobuf); return rv; } static int write_dblock(struct task *task, struct token *token, struct sync_disk *disk, uint64_t host_id, struct paxos_dblock *pd) { struct paxos_dblock pd_end; uint32_t checksum; int rv; if (token->flags & T_WRITE_DBLOCK_MBLOCK_SH) { /* special case to preserve our SH mode block within the dblock */ return write_dblock_mblock_sh(task, token, disk, host_id, pd); } /* 1 leader block + 1 request block; host_id N is block offset N-1 */ paxos_dblock_out(pd, &pd_end); /* * N.B. must compute checksum after the data has been byte swapped. 
*/ checksum = dblock_checksum(&pd_end); pd->checksum = checksum; pd_end.checksum = cpu_to_le32(checksum); rv = write_sector(disk, token->sector_size, 2 + host_id - 1, (char *)&pd_end, sizeof(struct paxos_dblock), task, token->io_timeout, "dblock"); return rv; } static int write_leader(struct task *task, struct token *token, struct sync_disk *disk, struct leader_record *lr) { struct leader_record lr_end; uint32_t checksum; int rv; leader_record_out(lr, &lr_end); /* * N.B. must compute checksum after the data has been byte swapped. */ checksum = leader_checksum(&lr_end); lr->checksum = checksum; lr_end.checksum = cpu_to_le32(checksum); rv = write_sector(disk, token->sector_size, 0, (char *)&lr_end, sizeof(struct leader_record), task, token->io_timeout, "leader"); return rv; } /* * NB. this should not be used to write the leader record, it is meant only * for manually clobbering the disk to corrupt it for testing, or to manually * repair it after it's corrupted. */ int paxos_lease_leader_clobber(struct task *task, struct token *token, struct leader_record *leader, const char *caller) { struct leader_record lr_end; uint32_t checksum; int rv; leader_record_out(leader, &lr_end); /* * N.B. must compute checksum after the data has been byte swapped. 
*/ checksum = leader_checksum(&lr_end); leader->checksum = checksum; lr_end.checksum = cpu_to_le32(checksum); rv = write_sector(&token->disks[0], token->sector_size, 0, (char *)&lr_end, sizeof(struct leader_record), task, token->io_timeout, caller); return rv; } static int read_dblock(struct task *task, struct token *token, struct sync_disk *disk, uint64_t host_id, struct paxos_dblock *pd) { struct paxos_dblock pd_end; int rv; /* 1 leader block + 1 request block; host_id N is block offset N-1 */ rv = read_sectors(disk, token->sector_size, 2 + host_id - 1, 1, (char *)&pd_end, sizeof(struct paxos_dblock), task, token->io_timeout, "dblock"); paxos_dblock_in(&pd_end, pd); return rv; } #if 0 static int read_dblocks(struct task *task, struct sync_disk *disk, struct paxos_dblock *pds, int pds_count) { struct paxos_dblock pd_end; char *data; int data_len, rv, i; data_len = pds_count * sector_size; data = malloc(data_len); if (!data) { log_error("read_dblocks malloc %d %s", data_len, disk->path); rv = -ENOMEM; goto out; } /* 2 = 1 leader block + 1 request block */ rv = read_sectors(disk, token->sector_size, 2, pds_count, data, data_len, task, "dblocks"); if (rv < 0) goto out_free; /* copy the first N bytes from each sector, where N is size of paxos_dblock */ for (i = 0; i < pds_count; i++) { memcpy(&pd_end, data + (i * sector_size), sizeof(struct paxos_dblock)); paxos_dblock_in(&pd_end, &pd); memcpy(&pds[i], &pd, sizeof(struct paxos_dblock)); } rv = 0; out_free: free(data); out: return rv; } #endif static int read_leader(struct task *task, struct token *token, struct sync_disk *disk, struct leader_record *lr, uint32_t *checksum) { struct leader_record lr_end; int rv; if (!token->sector_size) { log_errot(token, "paxos read_leader with zero sector_size"); return -EINVAL; } /* 0 = leader record is first sector */ rv = read_sectors(disk, token->sector_size, 0, 1, (char *)&lr_end, sizeof(struct leader_record), task, token->io_timeout, "leader"); /* N.B. 
checksum is computed while the data is in ondisk format. */ *checksum = leader_checksum(&lr_end); leader_record_in(&lr_end, lr); return rv; } static int verify_dblock(struct token *token, struct paxos_dblock *pd, uint32_t checksum) { if (!pd->checksum && !pd->mbal && !pd->bal && !pd->inp && !pd->lver) return SANLK_OK; if (pd->checksum != checksum) { log_errot(token, "verify_dblock wrong checksum %x %x", pd->checksum, checksum); return SANLK_DBLOCK_CHECKSUM; } return SANLK_OK; } /* * It's possible that we pick a bk_max from another host which has our own * inp values in it, and we can end up commiting our own inp values, copied * from another host's dblock: * * host2 leader free * host2 phase1 mbal 14002 * host2 writes dblock[1] mbal 14002 * host2 reads no higher mbal * host2 choose own inp 2,1 * host2 phase2 mbal 14002 bal 14002 inp 2,1 * host2 writes dblock[1] bal 14002 inp 2,1 * host1 leader free * host1 phase1 mbal 20001 * host1 writes dblock[0] mbal 20001 * host1 reads no higher mbal * host1 choose dblock[1] bal 14002 inp 2,1 * host1 phase2 mbal 20001 bal 20001 inp 2,1 * host1 writes dblock[0] bal 20001 inp 2,1 * host2 reads dblock[0] mbal 20001 > 14002 * abort2, retry * host2 leader free * host2 phase1 mbal 16002 * host2 writes dblock[1] mbal 16002 * host2 reads dblock[0] mbal 20001 > 16002 * abort1 retry * host2 leader free * host2 phase1 mbal 18002 * host2 writes dblock[1] mbal 18002 * host2 reads dblock[0] mbal 20001 > 18002 * abort1 retry * host2 leader free * host2 phase1 mbal 20002 * host2 writes dblock[1] mbal 20002 * host2 reads no higher mbal * host2 choose dblock[0] bal 20001 inp 2,1 * host1 reads dblock[1] mbal 20002 > 20001 * abort2 retry * host2 phase2 mbal 20002 bal 20002 inp 2,1 * host2 writes dblock[1] bal 20002 inp 2,1 * host2 reads no higher mbal * host2 commit inp 2,1 * host2 success * host1 leader owner 2,1 * host1 fail */ static int run_ballot(struct task *task, struct token *token, uint32_t flags, int num_hosts, uint64_t next_lver, 
uint64_t our_mbal, struct paxos_dblock *dblock_out) { char bk_debug[BK_DEBUG_SIZE]; char bk_str[BK_STR_SIZE]; int bk_debug_count; struct paxos_dblock dblock; struct paxos_dblock bk_in; struct paxos_dblock bk_max; struct paxos_dblock *bk_end; struct paxos_dblock *bk; struct sync_disk *disk; char *iobuf[SANLK_MAX_DISKS]; char **p_iobuf[SANLK_MAX_DISKS]; uint32_t checksum; int num_disks = token->r.num_disks; int num_writes, num_reads; int sector_size = token->sector_size; int sector_count; int iobuf_len; int phase2 = 0; int d, q, rv = 0; int q_max = -1; int error; sector_count = roundup_power_of_two(num_hosts + 2); iobuf_len = sector_count * sector_size; if (!iobuf_len) return -EINVAL; for (d = 0; d < num_disks; d++) { p_iobuf[d] = &iobuf[d]; rv = posix_memalign((void *)p_iobuf[d], getpagesize(), iobuf_len); if (rv) return rv; } /* * phase 1 * * "For each disk d, it tries first to write dblock[p] to disk[d][p] * and then to read disk[d][q] for all other processors q. It aborts * the ballot if, for any d and q, it finds disk[d][q].mbal > * dblock[p].mbal. The phase completes when p has written and read a * majority of the disks, without reading any block whose mbal * component is greater than dblock[p].mbal." 
*/ log_token(token, "ballot %llu phase1 write mbal %llu", (unsigned long long)next_lver, (unsigned long long)our_mbal); memset(&dblock, 0, sizeof(struct paxos_dblock)); dblock.mbal = our_mbal; dblock.lver = next_lver; dblock.checksum = 0; /* set after paxos_dblock_out */ memset(&bk_max, 0, sizeof(struct paxos_dblock)); num_writes = 0; for (d = 0; d < num_disks; d++) { /* acquire io: write 1 */ rv = write_dblock(task, token, &token->disks[d], token->host_id, &dblock); if (rv < 0) continue; num_writes++; } if (!majority_disks(num_disks, num_writes)) { log_errot(token, "ballot %llu dblock write error %d", (unsigned long long)next_lver, rv); error = SANLK_DBLOCK_WRITE; goto out; } memset(bk_debug, 0, sizeof(bk_debug)); bk_debug_count = 0; num_reads = 0; for (d = 0; d < num_disks; d++) { disk = &token->disks[d]; if (!iobuf[d]) continue; memset(iobuf[d], 0, iobuf_len); /* acquire io: read 2 */ rv = read_iobuf(disk->fd, disk->offset, iobuf[d], iobuf_len, task, token->io_timeout, NULL); if (rv == SANLK_AIO_TIMEOUT) iobuf[d] = NULL; if (rv < 0) continue; num_reads++; for (q = 0; q < num_hosts; q++) { bk_end = (struct paxos_dblock *)(iobuf[d] + ((2 + q)*sector_size)); checksum = dblock_checksum(bk_end); paxos_dblock_in(bk_end, &bk_in); bk = &bk_in; if (bk_in.mbal && ((flags & PAXOS_ACQUIRE_DEBUG_ALL) || (bk_in.lver >= dblock.lver))) { if (bk_debug_count >= BK_DEBUG_COUNT) { log_token(token, "ballot %llu phase1 read %s", (unsigned long long)next_lver, bk_debug); memset(bk_debug, 0, sizeof(bk_debug)); bk_debug_count = 0; } memset(bk_str, 0, sizeof(bk_str)); snprintf(bk_str, BK_STR_SIZE, "%d:%llu:%llu:%llu:%llu:%llu:%llu:%x,", q, (unsigned long long)bk_in.mbal, (unsigned long long)bk_in.bal, (unsigned long long)bk_in.inp, (unsigned long long)bk_in.inp2, (unsigned long long)bk_in.inp3, (unsigned long long)bk_in.lver, bk_in.flags); bk_str[BK_STR_SIZE-1] = '\0'; strncat(bk_debug, bk_str, BK_STR_SIZE-1); bk_debug_count++; } rv = verify_dblock(token, bk, checksum); if (rv < 0) 
continue; check_mode_block(token, next_lver, q, (char *)bk_end); if (bk->lver < dblock.lver) continue; if (bk->lver > dblock.lver) { log_warnt(token, "ballot %llu abort1 larger lver in bk[%d] %llu:%llu:%llu:%llu:%llu:%llu " "our dblock %llu:%llu:%llu:%llu:%llu:%llu", (unsigned long long)next_lver, q, (unsigned long long)bk->mbal, (unsigned long long)bk->bal, (unsigned long long)bk->inp, (unsigned long long)bk->inp2, (unsigned long long)bk->inp3, (unsigned long long)bk->lver, (unsigned long long)dblock.mbal, (unsigned long long)dblock.bal, (unsigned long long)dblock.inp, (unsigned long long)dblock.inp2, (unsigned long long)dblock.inp3, (unsigned long long)dblock.lver); log_token(token, "ballot %llu phase1 read %s", (unsigned long long)next_lver, bk_debug); error = SANLK_DBLOCK_LVER; goto out; } /* see "It aborts the ballot" in comment above */ if (bk->mbal > dblock.mbal) { log_warnt(token, "ballot %llu abort1 larger mbal in bk[%d] %llu:%llu:%llu:%llu:%llu:%llu " "our dblock %llu:%llu:%llu:%llu:%llu:%llu", (unsigned long long)next_lver, q, (unsigned long long)bk->mbal, (unsigned long long)bk->bal, (unsigned long long)bk->inp, (unsigned long long)bk->inp2, (unsigned long long)bk->inp3, (unsigned long long)bk->lver, (unsigned long long)dblock.mbal, (unsigned long long)dblock.bal, (unsigned long long)dblock.inp, (unsigned long long)dblock.inp2, (unsigned long long)dblock.inp3, (unsigned long long)dblock.lver); log_token(token, "ballot %llu phase1 read %s", (unsigned long long)next_lver, bk_debug); error = SANLK_DBLOCK_MBAL; goto out; } /* see choosing inp for phase 2 in comment below */ if (!bk->inp) continue; if (!bk->bal) { log_errot(token, "ballot %llu zero bal inp[%d] %llu", (unsigned long long)next_lver, q, (unsigned long long)bk->inp); continue; } if (bk->bal > bk_max.bal) { bk_max = *bk; q_max = q; } } } log_token(token, "ballot %llu phase1 read %s", (unsigned long long)next_lver, bk_debug); if (!majority_disks(num_disks, num_reads)) { log_errot(token, "ballot 
%llu dblock read error %d", (unsigned long long)next_lver, rv); error = SANLK_DBLOCK_READ; goto out; } /* * "When it completes phase 1, p chooses a new value of dblock[p].inp, * sets dblock[p].bal to dblock[p].mbal (its current ballot number), * and begins phase 2." * * "We now describe how processor p chooses the value of dblock[p].inp * that it tries to commit in phase 2. Let blocksSeen be the set * consisting of dblock[p] and all the records disk[d][q] read by p in * phase 1. Let nonInitBlks be the subset of blocksSeen consisting of * those records whose inp field is not NotAnInput. If nonInitBlks is * empty, then p sets dblock[p].inp to its own input value input[p]. * Otherwise, it sets dblock[p].inp to bk.inp for some record bk in * nonInitBlks having the largest value of bk.bal." */ if (bk_max.inp) { /* lver and mbal are already set */ dblock.inp = bk_max.inp; dblock.inp2 = bk_max.inp2; dblock.inp3 = bk_max.inp3; } else { /* lver and mbal are already set */ dblock.inp = token->host_id; dblock.inp2 = token->host_generation; dblock.inp3 = monotime(); } dblock.bal = dblock.mbal; dblock.checksum = 0; /* set after paxos_dblock_out */ if (bk_max.inp) { log_token(token, "ballot %llu choose bk_max[%d] lver %llu mbal %llu bal %llu inp %llu %llu %llu", (unsigned long long)next_lver, q_max, (unsigned long long)bk_max.lver, (unsigned long long)bk_max.mbal, (unsigned long long)bk_max.bal, (unsigned long long)bk_max.inp, (unsigned long long)bk_max.inp2, (unsigned long long)bk_max.inp3); } /* * phase 2 * * Same description as phase 1, same sequence of writes/reads. 
*/ phase2 = 1; log_token(token, "ballot %llu phase2 write bal %llu inp %llu %llu %llu q_max %d", (unsigned long long)dblock.lver, (unsigned long long)dblock.bal, (unsigned long long)dblock.inp, (unsigned long long)dblock.inp2, (unsigned long long)dblock.inp3, q_max); num_writes = 0; for (d = 0; d < num_disks; d++) { /* acquire io: write 2 */ rv = write_dblock(task, token, &token->disks[d], token->host_id, &dblock); if (rv < 0) continue; num_writes++; } if (!majority_disks(num_disks, num_writes)) { log_errot(token, "ballot %llu our dblock write2 error %d", (unsigned long long)next_lver, rv); error = SANLK_DBLOCK_WRITE; goto out; } memset(bk_debug, 0, sizeof(bk_debug)); bk_debug_count = 0; num_reads = 0; for (d = 0; d < num_disks; d++) { disk = &token->disks[d]; if (!iobuf[d]) continue; memset(iobuf[d], 0, iobuf_len); /* acquire io: read 3 */ rv = read_iobuf(disk->fd, disk->offset, iobuf[d], iobuf_len, task, token->io_timeout, NULL); if (rv == SANLK_AIO_TIMEOUT) iobuf[d] = NULL; if (rv < 0) continue; num_reads++; for (q = 0; q < num_hosts; q++) { bk_end = (struct paxos_dblock *)(iobuf[d] + ((2 + q)*sector_size)); checksum = dblock_checksum(bk_end); paxos_dblock_in(bk_end, &bk_in); bk = &bk_in; if (bk->mbal && ((flags & PAXOS_ACQUIRE_DEBUG_ALL) || (bk->lver >= dblock.lver))) { if (bk_debug_count >= BK_DEBUG_COUNT) { log_token(token, "ballot %llu phase2 read %s", (unsigned long long)next_lver, bk_debug); memset(bk_debug, 0, sizeof(bk_debug)); bk_debug_count = 0; } memset(bk_str, 0, sizeof(bk_str)); snprintf(bk_str, BK_STR_SIZE, "%d:%llu:%llu:%llu:%llu:%llu:%llu:%x,", q, (unsigned long long)bk->mbal, (unsigned long long)bk->bal, (unsigned long long)bk->inp, (unsigned long long)bk->inp2, (unsigned long long)bk->inp3, (unsigned long long)bk->lver, bk->flags); bk_str[BK_STR_SIZE-1] = '\0'; strncat(bk_debug, bk_str, BK_STR_SIZE-1); bk_debug_count++; } rv = verify_dblock(token, bk, checksum); if (rv < 0) continue; if (bk->lver < dblock.lver) continue; if (bk->lver > 
dblock.lver) { /* * This happens when we choose another host's bk, that host * acquires the lease itself, releases it, and reacquires it * with a new lver, all before we get here, at which point * we see the larger lver. I believe case this would always * also be caught the the bk->mbal > dblock.mbal condition * below. */ log_warnt(token, "ballot %llu abort2 larger lver in bk[%d] %llu:%llu:%llu:%llu:%llu:%llu " "our dblock %llu:%llu:%llu:%llu:%llu:%llu", (unsigned long long)next_lver, q, (unsigned long long)bk->mbal, (unsigned long long)bk->bal, (unsigned long long)bk->inp, (unsigned long long)bk->inp2, (unsigned long long)bk->inp3, (unsigned long long)bk->lver, (unsigned long long)dblock.mbal, (unsigned long long)dblock.bal, (unsigned long long)dblock.inp, (unsigned long long)dblock.inp2, (unsigned long long)dblock.inp3, (unsigned long long)dblock.lver); log_token(token, "ballot %llu phase2 read %s", (unsigned long long)next_lver, bk_debug); error = SANLK_DBLOCK_LVER; goto out; } /* see "It aborts the ballot" in comment above */ if (bk->mbal > dblock.mbal) { log_warnt(token, "ballot %llu abort2 larger mbal in bk[%d] %llu:%llu:%llu:%llu:%llu:%llu " "our dblock %llu:%llu:%llu:%llu:%llu:%llu", (unsigned long long)next_lver, q, (unsigned long long)bk->mbal, (unsigned long long)bk->bal, (unsigned long long)bk->inp, (unsigned long long)bk->inp2, (unsigned long long)bk->inp3, (unsigned long long)bk->lver, (unsigned long long)dblock.mbal, (unsigned long long)dblock.bal, (unsigned long long)dblock.inp, (unsigned long long)dblock.inp2, (unsigned long long)dblock.inp3, (unsigned long long)dblock.lver); log_token(token, "ballot %llu phase2 read %s", (unsigned long long)next_lver, bk_debug); error = SANLK_DBLOCK_MBAL; goto out; } } } log_token(token, "ballot %llu phase2 read %s", (unsigned long long)next_lver, bk_debug); if (!majority_disks(num_disks, num_reads)) { log_errot(token, "ballot %llu dblock read2 error %d", (unsigned long long)next_lver, rv); error = 
SANLK_DBLOCK_READ;
		goto out;
	}

	/* "When it completes phase 2, p has committed dblock[p].inp." */

	memcpy(dblock_out, &dblock, sizeof(struct paxos_dblock));
	error = SANLK_OK;

 out:
	for (d = 0; d < num_disks; d++) {
		/* don't free iobufs that have timed out */
		if (!iobuf[d])
			continue;
		free(iobuf[d]);
	}

	if (phase2 && (error < 0) &&
	    ((error == SANLK_DBLOCK_READ) || (error == SANLK_DBLOCK_WRITE))) {
		/*
		 * After phase2 we might "win" the ballot even if we don't complete it
		 * because another host could pick and commit our dblock values.
		 * If we abort the acquire, but are granted the lease, this would leave
		 * us owning the lease on disk.  With this flag, the release path will
		 * try to ensure we are not and do not become the lease owner.
		 */
		token->flags |= T_RETRACT_PAXOS;
		log_errot(token, "ballot %llu retract error %d",
			  (unsigned long long)next_lver, error);
	}

	memcpy(dblock_out, &dblock, sizeof(struct paxos_dblock));
	return error;
}

/*
 * Dump everything we know about a leader record as five error-level log
 * lines, "leader1" through "leader5":
 *   leader1: caller, result, and the token's lockspace/resource names
 *   leader2: disk path, offset and fd
 *   leader3: magic, version, sector_size, num_hosts, max_hosts, owner, lver
 *   leader4: the names stored in lr, timestamp and checksum
 *   leader5: write_id, write_generation, write_timestamp
 *
 * A NULL caller is logged as "unknown".  Names are printed with %.48s, so
 * they are not required to be NUL terminated within 48 bytes.
 */
static void log_leader_error(int result,
			     struct token *token,
			     struct sync_disk *disk,
			     struct leader_record *lr,
			     const char *caller)
{
	log_errot(token, "leader1 %s error %d sn %.48s rn %.48s",
		  caller ? caller : "unknown",
		  result,
		  token->r.lockspace_name,
		  token->r.name);

	log_errot(token, "leader2 path %s offset %llu fd %d",
		  disk->path,
		  (unsigned long long)disk->offset,
		  disk->fd);

	log_errot(token, "leader3 m %x v %x ss %u nh %llu mh %llu oi %llu og %llu lv %llu",
		  lr->magic,
		  lr->version,
		  lr->sector_size,
		  (unsigned long long)lr->num_hosts,
		  (unsigned long long)lr->max_hosts,
		  (unsigned long long)lr->owner_id,
		  (unsigned long long)lr->owner_generation,
		  (unsigned long long)lr->lver);

	log_errot(token, "leader4 sn %.48s rn %.48s ts %llu cs %x",
		  lr->space_name,
		  lr->resource_name,
		  (unsigned long long)lr->timestamp,
		  lr->checksum);

	log_errot(token, "leader5 wi %llu wg %llu wt %llu",
		  (unsigned long long)lr->write_id,
		  (unsigned long long)lr->write_generation,
		  (unsigned long long)lr->write_timestamp);
}

/*
 * Check a leader record against the token it is supposed to belong to.
 *
 * The checks run in this order and the first failure decides the result:
 * magic, major version (upper 16 bits of lr->version), lockspace name,
 * resource name, num_hosts not smaller than our host_id, and finally
 * lr->checksum against the checksum value passed in by the caller.
 *
 * Returns SANLK_OK, or the SANLK_LEADER_* code of the failed check.
 *
 * A record whose magic is PAXOS_DISK_CLEAR returns SANLK_LEADER_MAGIC
 * immediately and is never logged, whatever print_error says.
 *
 * With print_error set, a failure is logged together with a dump of lr,
 * and the leader is read from the disk once more and dumped as well.
 */
static int _verify_leader(struct token *token,
			 struct sync_disk *disk,
			 struct leader_record *lr,
			 uint32_t checksum,
			 const char *caller,
			 int print_error)
{
	struct leader_record leader_end;
	struct leader_record leader_rr;
	int result, rv;

	/* a cleared lease is an expected state, so it is reported quietly */
	if (lr->magic == PAXOS_DISK_CLEAR)
		return SANLK_LEADER_MAGIC;

	if (lr->magic != PAXOS_DISK_MAGIC) {
		result = SANLK_LEADER_MAGIC;
		goto fail;
	}

	/* only the major part of the version has to match */
	if ((lr->version & 0xFFFF0000) != PAXOS_DISK_VERSION_MAJOR) {
		result = SANLK_LEADER_VERSION;
		goto fail;
	}

	if (strncmp(lr->space_name, token->r.lockspace_name, NAME_ID_SIZE)) {
		result = SANLK_LEADER_LOCKSPACE;
		goto fail;
	}

	if (strncmp(lr->resource_name, token->r.name, NAME_ID_SIZE)) {
		result = SANLK_LEADER_RESOURCE;
		goto fail;
	}

	if (lr->num_hosts < token->host_id) {
		result = SANLK_LEADER_NUMHOSTS;
		goto fail;
	}

	if (lr->checksum != checksum) {
		result = SANLK_LEADER_CHECKSUM;
		goto fail;
	}

	return SANLK_OK;

 fail:
	if (!print_error)
		return result;

	switch (result) {
	case SANLK_LEADER_MAGIC:
		log_errot(token, "verify_leader wrong magic %x %s",
			  lr->magic, disk->path);
		break;
	case SANLK_LEADER_VERSION:
		log_errot(token, "verify_leader wrong version %x %s",
			  lr->version, disk->path);
		break;
	case SANLK_LEADER_LOCKSPACE:
log_errot(token, "verify_leader wrong space name %.48s %.48s %s",
			  lr->space_name, token->r.lockspace_name, disk->path);
		break;
	case SANLK_LEADER_RESOURCE:
		log_errot(token, "verify_leader wrong resource name %.48s %.48s %s",
			  lr->resource_name, token->r.name, disk->path);
		break;
	case SANLK_LEADER_NUMHOSTS:
		log_errot(token, "verify_leader num_hosts too small %llu %llu %s",
			  (unsigned long long)lr->num_hosts,
			  (unsigned long long)token->host_id, disk->path);
		break;
	case SANLK_LEADER_CHECKSUM:
		log_errot(token, "verify_leader wrong checksum %x %x %s",
			  lr->checksum, checksum, disk->path);
		break;
	};

	/* dump the record that failed verification */
	log_leader_error(result, token, disk, lr, caller);

	/*
	 * Read the leader from the disk again and dump that too.  The result
	 * of read_sectors() is only logged, never acted on; leader_end is
	 * zeroed first so something defined is logged even if nothing is read.
	 */
	memset(&leader_end, 0, sizeof(struct leader_record));

	rv = read_sectors(disk, token->sector_size, 0, 1, (char *)&leader_end,
			  sizeof(struct leader_record),
			  NULL, 1, "paxos_verify");

	leader_record_in(&leader_end, &leader_rr);

	log_leader_error(rv, token, disk, &leader_rr, "paxos_verify");

	/* the verdict on the record we were given is what gets returned */
	return result;
}

/* verify a leader record and log the details of any failure */
static int verify_leader(struct token *token,
			 struct sync_disk *disk,
			 struct leader_record *lr,
			 uint32_t checksum,
			 const char *caller)
{
	return _verify_leader(token, disk, lr, checksum, caller, 1);
}

/* verify a leader record without logging anything on failure */
static int verify_leader_no_error(struct token *token,
				  struct sync_disk *disk,
				  struct leader_record *lr,
				  uint32_t checksum,
				  const char *caller)
{
	return _verify_leader(token, disk, lr, checksum, caller, 0);
}

/* non-static entry point for verify_leader(), failures are logged */
int paxos_verify_leader(struct token *token,
			 struct sync_disk *disk,
			 struct leader_record *lr,
			 uint32_t checksum,
			 const char *caller)
{
	return verify_leader(token, disk, lr, checksum, caller);
}

/*
 * Return 1 if the first LEADER_COMPARE_LEN bytes of the two leader
 * records are identical, otherwise 0.
 */
static int leaders_match(struct leader_record *a,
			 struct leader_record *b)
{
	if (!memcmp(a, b, LEADER_COMPARE_LEN))
		return 1;
	return 0;
}

/*
 * read the lockspace name and resource name given the disk location
 *
 * The leader is read from the first disk of the token.  A name that is
 * empty in res is copied from the leader into the token before the leader
 * is checked, so that the name comparison in the verification cannot fail
 * on it.  With T_CHECK_EXISTS set only the magic is checked; otherwise the
 * leader is verified without logging failures.
 *
 * On SANLK_OK the names and lver from the leader are copied into res, and
 * the sector/align sizes of the token and the size flags of res are set
 * from the leader when the leader holds a sector size of 512 or 4096.
 *
 * Returns SANLK_OK, a negative error from read_leader(), or the result of
 * the leader check.
 */
int paxos_read_resource(struct task *task,
			struct token *token,
			struct sanlk_resource *res)
{
	struct leader_record leader;
	uint32_t checksum;
	int align_size;
	int tmp_sector_size = 0;	/* set when the 4096 below is only a guess */
	int rv;

	memset(&leader, 0, sizeof(struct leader_record));

	/*
	 * We don't know the sector size, so we don't know if we should read
	 * 512 or 4k, but it doesn't matter since the leader record is all that
	 * we need.  It's probably better to read 4k on a 512 disk than to read 512
	 * on a 4k disk, so always do a 4k read.
	 */
	if (!token->sector_size) {
		token->sector_size = 4096;
		token->align_size = sector_size_to_align_size_old(4096);
		tmp_sector_size = 1;
	}

	rv = read_leader(task, token, &token->disks[0], &leader, &checksum);
	/*
	 * NOTE(review): on this early return, and whenever the check below
	 * does not give SANLK_OK, a guessed 4096 sector_size stays in the
	 * token -- confirm that no caller expects it to still be 0.
	 */
	if (rv < 0)
		return rv;

	/* fill in names the caller did not supply, so they verify trivially */
	if (!res->lockspace_name[0])
		memcpy(token->r.lockspace_name, leader.space_name, NAME_ID_SIZE);

	if (!res->name[0])
		memcpy(token->r.name, leader.resource_name, NAME_ID_SIZE);

	if (token->flags & T_CHECK_EXISTS) {
		/* only asked whether a lease exists here: the magic is enough */
		if (leader.magic != PAXOS_DISK_MAGIC)
			rv = SANLK_LEADER_MAGIC;
		else
			rv = SANLK_OK;
	} else {
		rv = verify_leader_no_error(token, &token->disks[0], &leader, checksum, "read_resource");
	}

	if (rv == SANLK_OK) {
		memcpy(res->lockspace_name, leader.space_name, NAME_ID_SIZE);
		memcpy(res->name, leader.resource_name, NAME_ID_SIZE);
		res->lver = leader.lver;

		if ((leader.sector_size == 512) || (leader.sector_size == 4096)) {
			/* fall back to the size-derived alignment when no flag is set */
			align_size = leader_align_size_from_flag(leader.flags);
			if (!align_size)
				align_size = sector_size_to_align_size_old(leader.sector_size);

			token->sector_size = leader.sector_size;
			token->align_size = align_size;

			/* The flags set by the user may be wrong.
*/ sanlk_res_sector_flags_clear(&res->flags); sanlk_res_align_flags_clear(&res->flags); res->flags |= sanlk_res_sector_size_to_flag(leader.sector_size); res->flags |= sanlk_res_align_size_to_flag(align_size); } else if (tmp_sector_size) { /* we don't know the correct value, so don't set any */ /* FIXME: add a note about when this can happen */ token->sector_size = 0; token->align_size = 0; } } return rv; } int paxos_read_buf(struct task *task, struct token *token, char **buf_out) { char *iobuf, **p_iobuf; struct sync_disk *disk = &token->disks[0]; int rv, iobuf_len; if (!token->sector_size || !token->align_size) { log_errot(token, "paxos_read_buf with sector_size %d align_size %d", token->sector_size, token->align_size); return -EINVAL; } iobuf_len = token->align_size; if (iobuf_len < 0) return iobuf_len; p_iobuf = &iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) return rv; memset(iobuf, 0, iobuf_len); rv = read_iobuf(disk->fd, disk->offset, iobuf, iobuf_len, task, token->io_timeout, NULL); *buf_out = iobuf; return rv; } static int _leader_read_one(struct task *task, struct token *token, struct leader_record *leader_ret, const char *caller) { struct leader_record leader; uint32_t checksum; int rv; memset(&leader, 0, sizeof(struct leader_record)); rv = read_leader(task, token, &token->disks[0], &leader, &checksum); if (rv < 0) return rv; rv = verify_leader(token, &token->disks[0], &leader, checksum, caller); /* copy what we read even if verify finds a problem */ memcpy(leader_ret, &leader, sizeof(struct leader_record)); return rv; } /* TODO: completely untested */ static int _leader_read_num(struct task *task, struct token *token, struct leader_record *leader_ret, const char *caller) { struct leader_record leader; struct leader_record *leaders; uint32_t checksum; int *leader_reps; int leaders_len, leader_reps_len; int num_reads; int num_disks = token->r.num_disks; int rv = 0, d, i, found; int error; leaders_len = num_disks * 
sizeof(struct leader_record);
	leader_reps_len = num_disks * sizeof(int);

	/* one leader slot and one repeat counter per disk */
	leaders = malloc(leaders_len);
	if (!leaders)
		return -ENOMEM;

	leader_reps = malloc(leader_reps_len);
	if (!leader_reps) {
		free(leaders);
		return -ENOMEM;
	}

	/*
	 * find a leader block that's consistent on the majority of disks,
	 * so we can use as the basis for the new leader
	 */

	memset(&leader, 0, sizeof(struct leader_record));
	memset(leaders, 0, leaders_len);
	memset(leader_reps, 0, leader_reps_len);

	num_reads = 0;

	for (d = 0; d < num_disks; d++) {
		/* a disk that cannot be read or does not verify is just not counted */
		rv = read_leader(task, token, &token->disks[d], &leaders[d], &checksum);
		if (rv < 0)
			continue;

		rv = verify_leader(token, &token->disks[d], &leaders[d], checksum, caller);
		if (rv < 0)
			continue;

		num_reads++;

		leader_reps[d] = 1;

		/*
		 * count how many times the same leader block repeats
		 *
		 * The count is kept on the earliest disk i that holds the same
		 * leader as d; only the first match is credited.
		 */

		for (i = 0; i < d; i++) {
			if (leaders_match(&leaders[d], &leaders[i])) {
				leader_reps[i]++;
				break;
			}
		}
	}

	if (!majority_disks(num_disks, num_reads)) {
		/* rv is whatever the last disk tried in the loop returned */
		log_errot(token, "%s leader read error %d", caller, rv);
		error = SANLK_LEADER_READ;
		goto out;
	}

	/* check that a majority of disks have the same leader */

	found = 0;

	for (d = 0; d < num_disks; d++) {
		if (!majority_disks(num_disks, leader_reps[d]))
			continue;

		/* leader on d is the same on a majority of disks,
		   leader becomes the prototype for new_leader */

		memcpy(&leader, &leaders[d], sizeof(struct leader_record));
		found = 1;
		break;
	}

	if (!found) {
		log_errot(token, "%s leader inconsistent", caller);
		error = SANLK_LEADER_DIFF;
		goto out;
	}

	error = SANLK_OK;
 out:
	/* on the error paths leader is still all zeros from the memset above */
	memcpy(leader_ret, &leader, sizeof(struct leader_record));
	free(leaders);
	free(leader_reps);
	return error;
}

/*
 * Read the current leader record of the lease into leader_ret, from the
 * single disk or from a majority of the disks, and log lver and owner at
 * token level when that gives SANLK_OK.
 *
 * Returns the result of _leader_read_one() or _leader_read_num().
 */
int paxos_lease_leader_read(struct task *task, struct token *token,
			    struct leader_record *leader_ret,
			    const char *caller)
{
	int rv;

	/* _leader_read_num works fine for the single disk case, but
	   we can cut out a bunch of stuff when we know there's one disk */

	if (token->r.num_disks > 1)
		rv = _leader_read_num(task, token, leader_ret, caller);
	else
		rv = _leader_read_one(task,
token, leader_ret, caller); if (rv == SANLK_OK) log_token(token, "%s leader %llu owner %llu %llu %llu", caller, (unsigned long long)leader_ret->lver, (unsigned long long)leader_ret->owner_id, (unsigned long long)leader_ret->owner_generation, (unsigned long long)leader_ret->timestamp); return rv; } static int _lease_read_one(struct task *task, struct token *token, uint32_t flags, struct sync_disk *disk, struct leader_record *leader_ret, struct paxos_dblock *our_dblock, uint64_t *max_mbal, int *max_q, const char *caller, int log_bk_vals) { char bk_debug[BK_DEBUG_SIZE]; char bk_str[BK_STR_SIZE]; int bk_debug_count; struct leader_record leader_end; struct paxos_dblock our_dblock_end; struct paxos_dblock bk; char *iobuf, **p_iobuf; uint32_t host_id = token->host_id; uint32_t sector_size = token->sector_size; uint32_t checksum; struct paxos_dblock *bk_end; uint64_t tmp_mbal = 0; int q, tmp_q = -1, rv, iobuf_len; iobuf_len = token->align_size; if (iobuf_len < 0) return iobuf_len; p_iobuf = &iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) return rv; memset(iobuf, 0, iobuf_len); rv = read_iobuf(disk->fd, disk->offset, iobuf, iobuf_len, task, token->io_timeout, NULL); if (rv < 0) goto out; memcpy(&leader_end, iobuf, sizeof(struct leader_record)); checksum = leader_checksum(&leader_end); leader_record_in(&leader_end, leader_ret); memcpy(&our_dblock_end, iobuf + ((host_id + 1) * sector_size), sizeof(struct paxos_dblock)); paxos_dblock_in(&our_dblock_end, our_dblock); rv = verify_leader(token, disk, leader_ret, checksum, caller); if (rv < 0) goto out; memset(bk_debug, 0, sizeof(bk_debug)); bk_debug_count = 0; for (q = 0; q < leader_ret->num_hosts; q++) { bk_end = (struct paxos_dblock *)(iobuf + ((2 + q) * sector_size)); checksum = dblock_checksum(bk_end); paxos_dblock_in(bk_end, &bk); if (log_bk_vals && bk.mbal && ((flags & PAXOS_ACQUIRE_DEBUG_ALL) || (bk.lver >= leader_ret->lver))) { if (bk_debug_count >= BK_DEBUG_COUNT) { log_token(token, 
"leader %llu dblocks %s", (unsigned long long)leader_ret->lver, bk_debug); memset(bk_debug, 0, sizeof(bk_debug)); bk_debug_count = 0; } memset(bk_str, 0, sizeof(bk_str)); snprintf(bk_str, BK_STR_SIZE, "%d:%llu:%llu:%llu:%llu:%llu:%llu:%x,", q, (unsigned long long)bk.mbal, (unsigned long long)bk.bal, (unsigned long long)bk.inp, (unsigned long long)bk.inp2, (unsigned long long)bk.inp3, (unsigned long long)bk.lver, bk.flags); bk_str[BK_STR_SIZE-1] = '\0'; strncat(bk_debug, bk_str, BK_STR_SIZE-1); bk_debug_count++; } rv = verify_dblock(token, &bk, checksum); if (rv < 0) goto out; if (!tmp_mbal || bk.mbal > tmp_mbal) { tmp_mbal = bk.mbal; tmp_q = q; } } *max_mbal = tmp_mbal; *max_q = tmp_q; if (log_bk_vals) log_token(token, "leader %llu owner %llu %llu %llu dblocks %s", (unsigned long long)leader_ret->lver, (unsigned long long)leader_ret->owner_id, (unsigned long long)leader_ret->owner_generation, (unsigned long long)leader_ret->timestamp, bk_debug); out: if (rv != SANLK_AIO_TIMEOUT) free(iobuf); return rv; } /* TODO: completely untested */ static int _lease_read_num(struct task *task, struct token *token, uint32_t flags, struct leader_record *leader_ret, struct paxos_dblock *our_dblock, uint64_t *max_mbal, int *max_q, const char *caller) { struct paxos_dblock dblock_one; struct leader_record leader_one; struct leader_record *leaders; uint64_t tmp_mbal = 0; uint64_t mbal_one; int *leader_reps; int num_disks = token->r.num_disks; int leaders_len, leader_reps_len; int i, d, rv = 0, found, num_reads, q_one, tmp_q = -1; leaders_len = num_disks * sizeof(struct leader_record); leader_reps_len = num_disks * sizeof(int); leaders = malloc(leaders_len); if (!leaders) return -ENOMEM; leader_reps = malloc(leader_reps_len); if (!leader_reps) { free(leaders); return -ENOMEM; } memset(leaders, 0, leaders_len); memset(leader_reps, 0, leader_reps_len); num_reads = 0; for (d = 0; d < num_disks; d++) { rv = _lease_read_one(task, token, flags, &token->disks[d], &leader_one, &dblock_one, 
&mbal_one, &q_one, caller, 0); if (rv < 0) continue; num_reads++; if (!tmp_mbal || mbal_one > tmp_mbal) { tmp_mbal = mbal_one; tmp_q = q_one; memcpy(our_dblock, &dblock_one, sizeof(struct paxos_dblock)); } memcpy(&leaders[d], &leader_one, sizeof(struct leader_record)); leader_reps[d] = 1; /* count how many times the same leader block repeats */ for (i = 0; i < d; i++) { if (leaders_match(&leaders[d], &leaders[i])) { leader_reps[i]++; break; } } } *max_mbal = tmp_mbal; *max_q = tmp_q; if (!num_reads) { log_errot(token, "%s lease_read_num cannot read disks %d", caller, rv); rv = SANLK_DBLOCK_READ; goto out; } found = 0; for (d = 0; d < num_disks; d++) { if (!majority_disks(num_disks, leader_reps[d])) continue; /* leader on d is the same on a majority of disks, leader becomes the prototype for new_leader */ memcpy(leader_ret, &leaders[d], sizeof(struct leader_record)); found = 1; break; } if (!found) { log_errot(token, "%s lease_read_num leader inconsistent", caller); rv = SANLK_LEADER_DIFF; } out: free(leaders); free(leader_reps); return rv; } /* * read all the initial values needed to start disk paxos: * - the leader record * - our own dblock * - the max mbal from all dblocks * * Read the entire lease area in one i/o and copy all those * values from it. 
*/ static int paxos_lease_read(struct task *task, struct token *token, uint32_t flags, struct leader_record *leader_ret, uint64_t *max_mbal, const char *caller, int log_bk_vals) { struct paxos_dblock our_dblock; int rv, q = -1; if (token->r.num_disks > 1) rv = _lease_read_num(task, token, flags, leader_ret, &our_dblock, max_mbal, &q, caller); else rv = _lease_read_one(task, token, flags, &token->disks[0], leader_ret, &our_dblock, max_mbal, &q, caller, log_bk_vals); if (rv == SANLK_OK) log_token(token, "%s leader %llu owner %llu %llu %llu max mbal[%d] %llu " "our_dblock %llu %llu %llu %llu %llu %llu", caller, (unsigned long long)leader_ret->lver, (unsigned long long)leader_ret->owner_id, (unsigned long long)leader_ret->owner_generation, (unsigned long long)leader_ret->timestamp, q, (unsigned long long)*max_mbal, (unsigned long long)our_dblock.mbal, (unsigned long long)our_dblock.bal, (unsigned long long)our_dblock.inp, (unsigned long long)our_dblock.inp2, (unsigned long long)our_dblock.inp3, (unsigned long long)our_dblock.lver); return rv; } static int write_new_leader(struct task *task, struct token *token, struct leader_record *nl, const char *caller) { int num_disks = token->r.num_disks; int num_writes = 0; int timeout = 0; int rv = 0; int d; for (d = 0; d < num_disks; d++) { rv = write_leader(task, token, &token->disks[d], nl); if (rv == SANLK_AIO_TIMEOUT) timeout = 1; if (rv < 0) continue; num_writes++; } if (!majority_disks(num_disks, num_writes)) { log_errot(token, "%s write_new_leader error %d timeout %d owner %llu %llu %llu", caller, rv, timeout, (unsigned long long)nl->owner_id, (unsigned long long)nl->owner_generation, (unsigned long long)nl->timestamp); if (timeout) return SANLK_AIO_TIMEOUT; if (rv < 0) return rv; return SANLK_LEADER_WRITE; } return SANLK_OK; } /* * If we hang or crash after completing a ballot successfully, but before * commiting the leader_record, then the next host that runs a ballot (with the * same lver since we did not commit the 
new lver to the leader_record) will * commit the same inp values that we were about to commit. If the inp values * they commit indicate we (who crashed or hung) are the new owner, then the * other hosts will begin monitoring the liveness of our host_id. Once enough * time has passed, they assume we're dead, and go on with new versions. The * "enough time" ensures that if we hung before writing the leader, that we * won't wake up and finally write what will then be an old invalid leader. */ /* * i/o required to acquire a free lease * (1 disk in token, 512 byte sectors, default num_hosts of 2000) * * paxos_lease_acquire() * paxos_lease_read() 1 read 1 MB (entire lease area) * run_ballot() * write_dblock() 1 write 512 bytes (1 dblock sector) * read_iobuf() 1 read 1 MB (round up num_hosts + 2 sectors) * write_dblock() 1 write 512 bytes (1 dblock sector) * read_iobuf() 1 read 1 MB (round up num_hosts + 2 sectors) * write_new_leader() 1 write 512 bytes (1 leader sector) * * 6 i/os = 3 1MB reads, 3 512 byte writes */ int paxos_lease_acquire(struct task *task, struct token *token, uint32_t flags, struct leader_record *leader_ret, struct paxos_dblock *dblock_ret, uint64_t acquire_lver, int new_num_hosts) { struct sync_disk host_id_disk; struct leader_record host_id_leader; struct leader_record cur_leader; struct leader_record tmp_leader; struct leader_record new_leader; struct paxos_dblock dblock; struct paxos_dblock owner_dblock; struct host_status hs; uint64_t wait_start, now; uint64_t last_timestamp; uint64_t next_lver; uint64_t max_mbal; uint64_t num_mbal; uint64_t our_mbal; int copy_cur_leader; int disk_open = 0; int error, rv, us; int align_size; int ls_sector_size; int other_io_timeout, other_host_dead_seconds; memset(&dblock, 0, sizeof(dblock)); /* shut up compiler */ log_token(token, "paxos_acquire begin offset %llu 0x%x %d %d", (unsigned long long)token->disks[0].offset, flags, token->sector_size, token->align_size); if (!token->sector_size) { log_errot(token, 
"paxos_acquire with zero sector_size"); return -EINVAL; } restart: memset(&tmp_leader, 0, sizeof(tmp_leader)); copy_cur_leader = 0; /* acquire io: read 1 */ error = paxos_lease_read(task, token, flags, &cur_leader, &max_mbal, "paxos_acquire", 1); if (error < 0) goto out; align_size = leader_align_size_from_flag(cur_leader.flags); if (!align_size) align_size = sector_size_to_align_size_old(cur_leader.sector_size); /* * token sector_size/align_size are initially set from the lockspace values, * and paxos_lease_read() uses these values. It's possible but unusual * that the paxos lease leader record will have different sector/align * sizes than we used initially. */ if ((cur_leader.sector_size != token->sector_size) || (align_size != token->align_size)) { log_token(token, "paxos_acquire restart with different sizes was %d %d now %d %d", token->sector_size, token->align_size, cur_leader.sector_size, align_size); token->sector_size = cur_leader.sector_size; token->align_size = align_size; goto restart; } if (flags & PAXOS_ACQUIRE_FORCE) { copy_cur_leader = 1; goto run; } if (acquire_lver && cur_leader.lver != acquire_lver) { log_errot(token, "paxos_acquire acquire_lver %llu cur_leader %llu", (unsigned long long)acquire_lver, (unsigned long long)cur_leader.lver); error = SANLK_ACQUIRE_LVER; goto out; } if (cur_leader.timestamp == LEASE_FREE) { log_token(token, "paxos_acquire leader %llu free", (unsigned long long)cur_leader.lver); copy_cur_leader = 1; goto run; } if (cur_leader.owner_id == token->host_id && cur_leader.owner_generation == token->host_generation) { log_token(token, "paxos_acquire owner %llu %llu %llu is already local %llu %llu", (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp, (unsigned long long)token->host_id, (unsigned long long)token->host_generation); copy_cur_leader = 1; goto run; } /* * We were the last host to hold this lease, but in a previous * lockspace generation 
in which we didn't cleanly release the * paxos lease. */ if (cur_leader.owner_id == token->host_id && cur_leader.owner_generation < token->host_generation) { log_token(token, "paxos_acquire owner %llu %llu %llu was old local new is %llu", (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp, (unsigned long long)token->host_generation); copy_cur_leader = 1; goto run; } /* * Check if current owner is alive based on its host_id renewals. * If the current owner has been dead long enough we can assume that * its watchdog has triggered and we can go for the paxos lease. */ if (!disk_open) { memset(&host_id_disk, 0, sizeof(host_id_disk)); rv = lockspace_disk(cur_leader.space_name, &host_id_disk, &ls_sector_size); if (rv < 0) { log_errot(token, "paxos_acquire no lockspace info %.48s", cur_leader.space_name); error = SANLK_ACQUIRE_LOCKSPACE; goto out; } host_id_disk.fd = -1; rv = open_disks_fd(&host_id_disk, 1); if (rv < 0) { log_errot(token, "paxos_acquire open host_id_disk error %d", rv); error = SANLK_ACQUIRE_IDDISK; goto out; } disk_open = 1; } rv = host_info(cur_leader.space_name, cur_leader.owner_id, &hs); if (!rv && hs.last_check && hs.last_live && hs.owner_id == cur_leader.owner_id && hs.owner_generation == cur_leader.owner_generation) { wait_start = hs.last_live; last_timestamp = hs.timestamp; } else { wait_start = monotime(); last_timestamp = 0; } log_token(token, "paxos_acquire owner %llu %llu %llu " "host_status %llu %llu %llu wait_start %llu", (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp, (unsigned long long)hs.owner_id, (unsigned long long)hs.owner_generation, (unsigned long long)hs.timestamp, (unsigned long long)wait_start); while (1) { error = delta_lease_leader_read(task, ls_sector_size, token->io_timeout, &host_id_disk, cur_leader.space_name, cur_leader.owner_id, &host_id_leader, 
"paxos_acquire"); if (error < 0) { log_errot(token, "paxos_acquire owner %llu %llu %llu " "delta read %d fd %d path %s off %llu", (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp, error, host_id_disk.fd, host_id_disk.path, (unsigned long long)host_id_disk.offset); goto out; } /* a host_id cannot become free in less than host_dead_seconds after the final renewal because a host_id must first be acquired before being freed, and acquiring cannot take less than host_dead_seconds */ if (host_id_leader.timestamp == LEASE_FREE) { log_token(token, "paxos_acquire owner %llu delta free", (unsigned long long)cur_leader.owner_id); goto run; } /* another host has acquired the host_id of the host that owned this paxos lease; acquiring a host_id also cannot be done in less than host_dead_seconds, or the host_id that owns this lease may be alive, but it owned the lease in a previous generation without freeing it, and no longer owns it */ if (host_id_leader.owner_id != cur_leader.owner_id || host_id_leader.owner_generation > cur_leader.owner_generation) { log_token(token, "paxos_acquire owner %llu %llu %llu " "delta %llu %llu %llu mismatch", (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp, (unsigned long long)host_id_leader.owner_id, (unsigned long long)host_id_leader.owner_generation, (unsigned long long)host_id_leader.timestamp); goto run; } if (!last_timestamp) { last_timestamp = host_id_leader.timestamp; goto skip_live_check; } /* * Check if the owner is alive: * * 1. We just read the delta lease of the owner (host_id_leader). * If that has a newer timestamp than the timestamp last seen by * our own renewal thread (last_timestamp), then the owner is alive. * * 2. If our own renewal thread saw the owner's timestamp change * the last time it was checked, then consider the owner to be alive. 
*/ if ((host_id_leader.timestamp != last_timestamp) || (hs.last_live && (hs.last_check == hs.last_live))) { log_token(token, "paxos_acquire owner %llu delta %llu %llu %llu alive", (unsigned long long)cur_leader.owner_id, (unsigned long long)host_id_leader.owner_id, (unsigned long long)host_id_leader.owner_generation, (unsigned long long)host_id_leader.timestamp); memcpy(leader_ret, &cur_leader, sizeof(struct leader_record)); /* It's possible that the live owner has released the lease, but its release was clobbered by another host that was running the ballot with it and wrote it as the owner. If the leader writer was not the owner, check if the owner's dblock is cleared. If so, then the owner released the lease and we can run a ballot. Comparing the write_id and owner_id is not required; we could always read the owner dblock here, but comparing the writer and owner can eliminate many unnecessary dblock reads. */ if (cur_leader.write_id != cur_leader.owner_id) { rv = read_dblock(task, token, &token->disks[0], cur_leader.owner_id, &owner_dblock); if (!rv && (owner_dblock.flags & DBLOCK_FL_RELEASED)) { /* not an error, but interesting to see */ log_warnt(token, "paxos_acquire owner %llu %llu %llu writer %llu owner dblock released", (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp, (unsigned long long)cur_leader.write_id); goto run; } } error = SANLK_ACQUIRE_IDLIVE; goto out; } /* If the owner hasn't renewed its host_id lease for host_dead_seconds then its watchdog should have fired by now. 
*/ now = monotime(); other_io_timeout = hs.io_timeout; other_host_dead_seconds = calc_host_dead_seconds(other_io_timeout); if (now - wait_start > other_host_dead_seconds) { log_token(token, "paxos_acquire owner %llu %llu %llu " "delta %llu %llu %llu dead %llu-%llu>%d", (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp, (unsigned long long)host_id_leader.owner_id, (unsigned long long)host_id_leader.owner_generation, (unsigned long long)host_id_leader.timestamp, (unsigned long long)now, (unsigned long long)wait_start, other_host_dead_seconds); goto run; } if (flags & PAXOS_ACQUIRE_OWNER_NOWAIT) { log_token(token, "paxos_acquire owner %llu %llu %llu no wait", (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp); error = SANLK_ACQUIRE_OWNED_RETRY; goto out; } skip_live_check: /* TODO: test with sleep(2) here */ sleep(1); if (external_shutdown) { error = -1; goto out; } /* * In this while loop we are waiting for an indication that the * current owner is alive or dead, but if we see the leader * owner change in the meantime, we'll restart the entire * process. */ error = paxos_lease_leader_read(task, token, &tmp_leader, "paxos_acquire"); if (error < 0) goto out; if (memcmp(&cur_leader, &tmp_leader, sizeof(struct leader_record))) { log_token(token, "paxos_acquire restart leader changed1 from " "%llu %llu %llu to %llu %llu %llu", (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp, (unsigned long long)tmp_leader.owner_id, (unsigned long long)tmp_leader.owner_generation, (unsigned long long)tmp_leader.timestamp); goto restart; } } run: /* * Use the disk paxos algorithm to attempt to commit a new leader. * * If we complete a ballot successfully, we can commit a leader record * with next_lver. 
If we find a higher mbal during a ballot, we increase * our own mbal and try the ballot again. * * next_lver is derived from cur_leader with a zero or timed out owner. * We need to monitor the leader record to see if another host commits * a new leader_record with next_lver. * * TODO: may not need to increase mbal if dblock.inp and inp2 match * current host_id and generation? */ /* This next_lver assignment is based on the original cur_leader, not a re-reading of the leader here, i.e. we cannot just re-read the leader here, and make next_lver one more than that. This is because another node may have made us the owner of next_lver as it is now. */ next_lver = cur_leader.lver + 1; if (!max_mbal) { our_mbal = token->host_id; } else { num_mbal = max_mbal - (max_mbal % cur_leader.max_hosts); our_mbal = num_mbal + cur_leader.max_hosts + token->host_id; } retry_ballot: if (copy_cur_leader) { /* reusing the initial read removes an iop in the common case */ copy_cur_leader = 0; memcpy(&tmp_leader, &cur_leader, sizeof(struct leader_record)); } else { /* acquire io: read 1 (for retry) */ error = paxos_lease_leader_read(task, token, &tmp_leader, "paxos_acquire"); if (error < 0) goto out; } if (tmp_leader.lver == next_lver) { /* * another host has commited a leader_record for next_lver, * check which inp (owner_id) they commited (possibly us). 
*/ if (tmp_leader.owner_id == token->host_id && tmp_leader.owner_generation == token->host_generation) { /* not a problem, but interesting to see */ log_warnt(token, "paxos_acquire %llu owner is our inp " "%llu %llu %llu commited by %llu", (unsigned long long)next_lver, (unsigned long long)tmp_leader.owner_id, (unsigned long long)tmp_leader.owner_generation, (unsigned long long)tmp_leader.timestamp, (unsigned long long)tmp_leader.write_id); memcpy(leader_ret, &tmp_leader, sizeof(struct leader_record)); memcpy(dblock_ret, &dblock, sizeof(struct paxos_dblock)); error = SANLK_OK; } else { /* not a problem, but interesting to see */ log_warnt(token, "paxos_acquire %llu owner is %llu %llu %llu", (unsigned long long)next_lver, (unsigned long long)tmp_leader.owner_id, (unsigned long long)tmp_leader.owner_generation, (unsigned long long)tmp_leader.timestamp); memcpy(leader_ret, &tmp_leader, sizeof(struct leader_record)); error = SANLK_ACQUIRE_OWNED; } goto out; } if (tmp_leader.lver > next_lver) { /* * A case where this was observed: for next_lver 65 we abort1, and delay. * While sleeping, the lease v65 (which was acquired during our abort1) is * released and then reacquired as v66. When we goto retry_ballot, our * next_lver is 65, but the current lver on disk is 66, causing us to * we fail in the larger1 check.) 
*/ log_token(token, "paxos_acquire %llu restart new lver %llu from " "%llu %llu %llu to %llu %llu %llu", (unsigned long long)next_lver, (unsigned long long)tmp_leader.lver, (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp, (unsigned long long)tmp_leader.owner_id, (unsigned long long)tmp_leader.owner_generation, (unsigned long long)tmp_leader.timestamp); goto restart; } if (memcmp(&cur_leader, &tmp_leader, sizeof(struct leader_record))) { log_token(token, "paxos_acquire %llu restart leader changed2 from " "%llu %llu %llu to %llu %llu %llu", (unsigned long long)next_lver, (unsigned long long)cur_leader.owner_id, (unsigned long long)cur_leader.owner_generation, (unsigned long long)cur_leader.timestamp, (unsigned long long)tmp_leader.owner_id, (unsigned long long)tmp_leader.owner_generation, (unsigned long long)tmp_leader.timestamp); goto restart; } error = run_ballot(task, token, flags, cur_leader.num_hosts, next_lver, our_mbal, &dblock); if ((error == SANLK_DBLOCK_MBAL) || (error == SANLK_DBLOCK_LVER)) { us = get_rand(0, 1000000); if (us < 0) us = token->host_id * 100; log_token(token, "paxos_acquire %llu retry delay %d us", (unsigned long long)next_lver, us); usleep(us); our_mbal += cur_leader.max_hosts; goto retry_ballot; } if (error < 0) { log_errot(token, "paxos_acquire %llu ballot error %d", (unsigned long long)next_lver, error); goto out; } /* ballot success, commit next_lver with dblock values */ memcpy(&new_leader, &cur_leader, sizeof(struct leader_record)); new_leader.lver = dblock.lver; new_leader.owner_id = dblock.inp; new_leader.owner_generation = dblock.inp2; new_leader.timestamp = dblock.inp3; new_leader.write_id = token->host_id; new_leader.write_generation = token->host_generation; new_leader.write_timestamp = monotime(); if (new_num_hosts) new_leader.num_hosts = new_num_hosts; if (new_leader.owner_id == token->host_id) { /* * The LFL_SHORT_HOLD flag is just a "hint" to 
help * other nodes be more intelligent about retrying * due to transient failures when acquiring shared * leases. Only modify SHORT_HOLD if we're commiting * ourself as the new owner. If we're commiting another * host as owner, we don't know if they are acquiring * shared or not. */ if (flags & PAXOS_ACQUIRE_SHARED) new_leader.flags |= LFL_SHORT_HOLD; else new_leader.flags &= ~LFL_SHORT_HOLD; } new_leader.checksum = 0; /* set after leader_record_out */ error = write_new_leader(task, token, &new_leader, "paxos_acquire"); if (error < 0) { /* See comment in run_ballot about this flag. */ token->flags |= T_RETRACT_PAXOS; memcpy(leader_ret, &new_leader, sizeof(struct leader_record)); goto out; } if (new_leader.owner_id != token->host_id) { /* not a problem, but interesting to see */ /* It's possible that we commit an outdated owner id/gen here. If we go back to the top and retry, we may find that the owner host_id is alive but with a newer generation, and we'd be able to get the lease by running the ballot again. 
*/ log_warnt(token, "ballot %llu commit other owner %llu %llu %llu", (unsigned long long)new_leader.lver, (unsigned long long)new_leader.owner_id, (unsigned long long)new_leader.owner_generation, (unsigned long long)new_leader.timestamp); memcpy(leader_ret, &new_leader, sizeof(struct leader_record)); error = SANLK_ACQUIRE_OTHER; goto out; } log_token(token, "ballot %llu commit self owner %llu %llu %llu", (unsigned long long)next_lver, (unsigned long long)new_leader.owner_id, (unsigned long long)new_leader.owner_generation, (unsigned long long)new_leader.timestamp); memcpy(leader_ret, &new_leader, sizeof(struct leader_record)); memcpy(dblock_ret, &dblock, sizeof(struct paxos_dblock)); error = SANLK_OK; out: if (disk_open) close_disks(&host_id_disk, 1); return error; } #if 0 int paxos_lease_renew(struct task *task, struct token *token, struct leader_record *leader_last, struct leader_record *leader_ret) { struct leader_record new_leader; int rv, d; int error; for (d = 0; d < token->r.num_disks; d++) { memset(&new_leader, 0, sizeof(struct leader_record)); rv = read_leader(task, token, &token->disks[d], &new_leader); if (rv < 0) continue; if (memcmp(&new_leader, leader_last, sizeof(struct leader_record))) { log_errot(token, "leader changed between renewals"); return SANLK_BAD_LEADER; } } new_leader.timestamp = monotime(); new_leader.checksum = 0; /* set after leader_record_out */ error = write_new_leader(task, token, &new_leader); if (error < 0) goto out; memcpy(leader_ret, &new_leader, sizeof(struct leader_record)); out: return error; } #endif int paxos_lease_release(struct task *task, struct token *token, struct sanlk_resource *resrename, struct leader_record *leader_last, struct leader_record *leader_ret) { struct leader_record leader; struct leader_record *last; int error; error = paxos_lease_leader_read(task, token, &leader, "paxos_release"); if (error < 0) { log_errot(token, "paxos_release leader_read error %d", error); goto out; } /* * Used when the caller does 
not know who the owner is, but * wants to ensure it is not the owner. */ if (!leader_last) last = &leader; else last = leader_last; /* * This will happen when two hosts finish the same ballot * successfully, the second commiting the same inp values * that the first did, as it should. But the second will * write it's own write_id/gen/timestap, which will differ * from what the first host wrote. So when the first host * rereads here in the release, it will find different * write_id/gen/timestamp from what it wrote. This is * perfectly fine (use log warn since it's interesting * to see when this happens.) * * If another host was the writer and committed us as the * owner, then we don't zero the leader record when we release, * we just release our dblock (by setting the release flag, * already done prior to calling paxos_lease_release). This is * because other hosts will ignore our leader record if we were * not the writer once we release our dblock. Those other * hosts will then run a ballot and commit/write a new leader. * If we are also zeroing the leader, that can race with * another host writing a new leader, and we could clobber the * new leader. 
*/ if (leader.write_id != token->host_id) { log_warnt(token, "paxos_release skip write " "last lver %llu owner %llu %llu %llu writer %llu %llu %llu " "disk lver %llu owner %llu %llu %llu writer %llu %llu %llu", (unsigned long long)last->lver, (unsigned long long)last->owner_id, (unsigned long long)last->owner_generation, (unsigned long long)last->timestamp, (unsigned long long)last->write_id, (unsigned long long)last->write_generation, (unsigned long long)last->write_timestamp, (unsigned long long)leader.lver, (unsigned long long)leader.owner_id, (unsigned long long)leader.owner_generation, (unsigned long long)leader.timestamp, (unsigned long long)leader.write_id, (unsigned long long)leader.write_generation, (unsigned long long)leader.write_timestamp); return 0; } /* * When we were the writer of our own leader record, then * releasing the lease includes both setting the REALEASED flag * in our dblock and clearing out timestamp in the leader. * When we reread the leader here in release, we should find * it the same as we last saw in acquire. 
*/ if (leader.lver != last->lver) { log_errot(token, "paxos_release other lver " "last lver %llu owner %llu %llu %llu writer %llu %llu %llu " "disk lver %llu owner %llu %llu %llu writer %llu %llu %llu", (unsigned long long)last->lver, (unsigned long long)last->owner_id, (unsigned long long)last->owner_generation, (unsigned long long)last->timestamp, (unsigned long long)last->write_id, (unsigned long long)last->write_generation, (unsigned long long)last->write_timestamp, (unsigned long long)leader.lver, (unsigned long long)leader.owner_id, (unsigned long long)leader.owner_generation, (unsigned long long)leader.timestamp, (unsigned long long)leader.write_id, (unsigned long long)leader.write_generation, (unsigned long long)leader.write_timestamp); return SANLK_RELEASE_LVER; } if (leader.timestamp == LEASE_FREE) { log_errot(token, "paxos_release already free " "last lver %llu owner %llu %llu %llu writer %llu %llu %llu " "disk lver %llu owner %llu %llu %llu writer %llu %llu %llu", (unsigned long long)last->lver, (unsigned long long)last->owner_id, (unsigned long long)last->owner_generation, (unsigned long long)last->timestamp, (unsigned long long)last->write_id, (unsigned long long)last->write_generation, (unsigned long long)last->write_timestamp, (unsigned long long)leader.lver, (unsigned long long)leader.owner_id, (unsigned long long)leader.owner_generation, (unsigned long long)leader.timestamp, (unsigned long long)leader.write_id, (unsigned long long)leader.write_generation, (unsigned long long)leader.write_timestamp); return SANLK_RELEASE_OWNER; } if (leader.owner_id != token->host_id || leader.owner_generation != token->host_generation) { log_errot(token, "paxos_release other owner " "last lver %llu owner %llu %llu %llu writer %llu %llu %llu " "disk lver %llu owner %llu %llu %llu writer %llu %llu %llu", (unsigned long long)last->lver, (unsigned long long)last->owner_id, (unsigned long long)last->owner_generation, (unsigned long long)last->timestamp, (unsigned long 
long)last->write_id, (unsigned long long)last->write_generation, (unsigned long long)last->write_timestamp, (unsigned long long)leader.lver, (unsigned long long)leader.owner_id, (unsigned long long)leader.owner_generation, (unsigned long long)leader.timestamp, (unsigned long long)leader.write_id, (unsigned long long)leader.write_generation, (unsigned long long)leader.write_timestamp); return SANLK_RELEASE_OWNER; } if (memcmp(&leader, last, sizeof(struct leader_record))) { log_errot(token, "paxos_release different vals " "last lver %llu owner %llu %llu %llu writer %llu %llu %llu " "disk lver %llu owner %llu %llu %llu writer %llu %llu %llu", (unsigned long long)last->lver, (unsigned long long)last->owner_id, (unsigned long long)last->owner_generation, (unsigned long long)last->timestamp, (unsigned long long)last->write_id, (unsigned long long)last->write_generation, (unsigned long long)last->write_timestamp, (unsigned long long)leader.lver, (unsigned long long)leader.owner_id, (unsigned long long)leader.owner_generation, (unsigned long long)leader.timestamp, (unsigned long long)leader.write_id, (unsigned long long)leader.write_generation, (unsigned long long)leader.write_timestamp); return SANLK_RELEASE_OWNER; } if (resrename) memcpy(leader.resource_name, resrename->name, NAME_ID_SIZE); leader.timestamp = LEASE_FREE; leader.write_id = token->host_id; leader.write_generation = token->host_generation; leader.write_timestamp = monotime(); leader.flags &= ~LFL_SHORT_HOLD; leader.checksum = 0; /* set after leader_record_out */ error = write_new_leader(task, token, &leader, "paxos_release"); if (error < 0) goto out; memcpy(leader_ret, &leader, sizeof(struct leader_record)); out: return error; } int paxos_lease_init(struct task *task, struct token *token, int num_hosts, int write_clear) { char *iobuf, **p_iobuf; struct leader_record leader; struct leader_record leader_end; struct request_record rr; struct request_record rr_end; uint32_t checksum; int iobuf_len; int 
sector_size = 0; int align_size = 0; int max_hosts = 0; int aio_timeout = 0; int write_io_timeout = 0; int rv, d; rv = sizes_from_flags(token->r.flags, §or_size, &align_size, &max_hosts, "RES"); if (rv) return rv; if (!sector_size) { /* sector/align flags were not set, use historical defaults */ sector_size = token->disks[0].sector_size; align_size = sector_size_to_align_size_old(sector_size); max_hosts = DEFAULT_MAX_HOSTS; } if (!num_hosts || (num_hosts > max_hosts)) num_hosts = max_hosts; token->sector_size = sector_size; token->align_size = align_size; iobuf_len = align_size; p_iobuf = &iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) return rv; memset(iobuf, 0, iobuf_len); memset(&leader, 0, sizeof(leader)); if (write_clear) { leader.magic = PAXOS_DISK_CLEAR; leader.write_timestamp = monotime(); } else { leader.magic = PAXOS_DISK_MAGIC; } leader.timestamp = LEASE_FREE; leader.version = PAXOS_DISK_VERSION_MAJOR | PAXOS_DISK_VERSION_MINOR; leader.flags = leader_align_flag_from_size(align_size); leader.sector_size = sector_size; leader.num_hosts = num_hosts; leader.max_hosts = max_hosts; strncpy(leader.space_name, token->r.lockspace_name, NAME_ID_SIZE); strncpy(leader.resource_name, token->r.name, NAME_ID_SIZE); leader.checksum = 0; /* set after leader_record_out */ memset(&rr, 0, sizeof(rr)); rr.magic = REQ_DISK_MAGIC; rr.version = REQ_DISK_VERSION_MAJOR | REQ_DISK_VERSION_MINOR; leader_record_out(&leader, &leader_end); /* * N.B. must compute checksum after the data has been byte swapped. */ checksum = leader_checksum(&leader_end); leader.checksum = checksum; leader_end.checksum = cpu_to_le32(checksum); request_record_out(&rr, &rr_end); memcpy(iobuf, &leader_end, sizeof(struct leader_record)); memcpy(iobuf + sector_size, &rr_end, sizeof(struct request_record)); /* * The process of initializing the lease on disk can use a * longer timeout than the algorithm uses. 
*/ if (com.write_init_io_timeout) write_io_timeout = com.write_init_io_timeout; for (d = 0; d < token->r.num_disks; d++) { if (!write_io_timeout) write_io_timeout = token->io_timeout; rv = write_iobuf(token->disks[d].fd, token->disks[d].offset, iobuf, iobuf_len, task, write_io_timeout, NULL); if (rv == SANLK_AIO_TIMEOUT) aio_timeout = 1; if (rv < 0) return rv; } if (!aio_timeout) free(iobuf); return 0; } sanlock-3.8.2/src/paxos_lease.h000066400000000000000000000044541371427612200164350ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __PAXOS_LEASE_H__ #define __PAXOS_LEASE_H__ #define PAXOS_ACQUIRE_FORCE 0x00000001 #define PAXOS_ACQUIRE_QUIET_FAIL 0x00000002 #define PAXOS_ACQUIRE_SHARED 0x00000004 #define PAXOS_ACQUIRE_OWNER_NOWAIT 0x00000008 #define PAXOS_ACQUIRE_DEBUG_ALL 0x00000010 uint32_t leader_checksum(struct leader_record *lr); uint32_t dblock_checksum(struct paxos_dblock *pd); int paxos_lease_leader_read(struct task *task, struct token *token, struct leader_record *leader_ret, const char *caller); int paxos_lease_acquire(struct task *task, struct token *token, uint32_t flags, struct leader_record *leader_ret, struct paxos_dblock *dblock_ret, uint64_t acquire_lver, int new_num_hosts); int paxos_lease_release(struct task *task, struct token *token, struct sanlk_resource *resrename, struct leader_record *leader_last, struct leader_record *leader_ret); int paxos_lease_init(struct task *task, struct token *token, int num_hosts, int write_clear); int paxos_lease_request_read(struct task *task, struct token *token, struct request_record *rr); int paxos_lease_request_write(struct task *task, struct token *token, struct request_record *rr); int paxos_read_resource(struct task *task, struct token *token, struct 
sanlk_resource *res); int paxos_read_buf(struct task *task, struct token *token, char **buf_out); int paxos_verify_leader(struct token *token, struct sync_disk *disk, struct leader_record *lr, uint32_t checksum, const char *caller); int paxos_erase_dblock(struct task *task, struct token *token, uint64_t host_id); int paxos_lease_leader_clobber(struct task *task, struct token *token, struct leader_record *leader, const char *caller); #endif sanlock-3.8.2/src/resource.c000066400000000000000000002214571371427612200157600ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "diskio.h" #include "ondisk.h" #include "log.h" #include "paxos_lease.h" #include "lockspace.h" #include "resource.h" #include "task.h" #include "timeouts.h" #include "helper.h" /* from cmd.c */ void send_state_resource(int fd, struct resource *r, const char *list_name, int pid, uint32_t token_id); /* from main.c */ int get_rand(int a, int b); static pthread_t resource_pt; static int resource_thread_stop; static int resource_thread_work; static int resource_thread_work_examine; static struct list_head resources_free; static struct list_head resources_held; static struct list_head resources_add; static struct list_head resources_rem; static struct list_head resources_orphan; static pthread_mutex_t resource_mutex; static pthread_cond_t resource_cond; static struct list_head host_events; static int resources_free_count; static uint32_t resource_id_counter = 2; /* id 1 used for internal rindex lease */ #define FREE_RES_COUNT 128 /* * There's not much advantage to saving resource structs 
and reusing them again * when they are requested again. One advantage can be that the res_id remains * unchanged for frequently requested resources, so a new resource description * isn't logged each time it's requested. There may be some other * optimizations that could be added. We may want per-lockspace lists of * resources, or purge free resources when lockspaces are removed. */ static void free_resource(struct resource *r) { struct resource *rtmp = NULL; struct resource *rmin = NULL; if (r->lvb) free(r->lvb); if (resources_free_count < FREE_RES_COUNT) { resources_free_count++; list_add(&r->list, &resources_free); return; } /* the max are being saved, free the least used before saving this one */ list_for_each_entry_reverse(rtmp, &resources_free, list) { if (!rtmp->reused) { list_del(&rtmp->list); free(rtmp); goto out; } if (!rmin || (rtmp->reused < rmin->reused)) rmin = rtmp; } if (rmin) { list_del(&rmin->list); free(rmin); } out: list_add(&r->list, &resources_free); } static struct resource *get_free_resource(struct token *token, int *token_matches) { struct resource *r; /* find a previous r that matches token */ list_for_each_entry(r, &resources_free, list) { if (strcmp(r->r.lockspace_name, token->r.lockspace_name)) continue; if (strcmp(r->r.name, token->r.name)) continue; if (r->r.num_disks != token->r.num_disks) continue; if (strcmp(r->r.disks[0].path, token->r.disks[0].path)) continue; *token_matches = 1; resources_free_count--; list_del(&r->list); r->reused++; return r; } return NULL; } /* N.B. the reporting function looks for the strings "add" and "rem", so if changed, they should be changed in both places. 
*/ void send_state_resources(int fd) { struct resource *r; struct token *token; pthread_mutex_lock(&resource_mutex); list_for_each_entry(r, &resources_held, list) { list_for_each_entry(token, &r->tokens, list) send_state_resource(fd, r, "held", token->pid, token->token_id); } list_for_each_entry(r, &resources_add, list) { list_for_each_entry(token, &r->tokens, list) send_state_resource(fd, r, "add", token->pid, token->token_id); } list_for_each_entry(r, &resources_rem, list) send_state_resource(fd, r, "rem", r->pid, 0); list_for_each_entry(r, &resources_orphan, list) send_state_resource(fd, r, "orphan", r->pid, 0); pthread_mutex_unlock(&resource_mutex); } int read_resource_owners(struct task *task, struct token *token, struct sanlk_resource *res, char **send_buf, int *send_len, int *count) { struct leader_record leader; struct leader_record leader_end; struct mode_block mb; struct sync_disk *disk; struct sanlk_host *host; struct mode_block *mb_end; uint64_t host_id; uint32_t checksum; char *lease_buf_dblock; char *lease_buf = NULL; char *hosts_buf = NULL; const int sector_size_set = token->sector_size != 0; const int align_size_set = token->align_size != 0; int align_size; int host_count = 0; int i, rv; disk = &token->disks[0]; /* If sector size not set, start with the smaller one. */ if (!sector_size_set) token->sector_size = 512; /* If align size not set, default to the older align. 
*/ if (!align_size_set) token->align_size = sector_size_to_align_size_old(token->sector_size); /* we could in-line paxos_read_buf here like we do in read_mode_block */ retry: rv = paxos_read_buf(task, token, &lease_buf); if (rv < 0) { log_errot(token, "read_resource_owners read_buf rv %d", rv); if (lease_buf && (rv != SANLK_AIO_TIMEOUT)) free(lease_buf); return rv; } memcpy(&leader_end, lease_buf, sizeof(struct leader_record)); checksum = leader_checksum(&leader_end); leader_record_in(&leader_end, &leader); align_size = leader_align_size_from_flag(leader.flags); if (!align_size) align_size = sector_size_to_align_size_old(leader.sector_size); /* If caller specified values are incorrect, fail. */ if (sector_size_set && token->sector_size != leader.sector_size) { log_errot(token, "read_resource_owners invalid sector_size: %d actual: %d", token->sector_size, leader.sector_size); rv = -EINVAL; goto out; } if (align_size_set && token->align_size != align_size) { log_errot(token, "read_resource_owners invalid align_size: %d actual: %d", token->align_size, align_size); rv = -EINVAL; goto out; } /* * If the caller did not specify sector size and our guess was wrong, * retry with the actual value */ if ((token->sector_size != leader.sector_size) || (token->align_size != align_size)) { log_debug("read_resource_owners rereading with correct sizes"); token->sector_size = leader.sector_size; token->align_size = align_size; free(lease_buf); lease_buf = NULL; goto retry; } token->sector_size = leader.sector_size; token->align_size = align_size; rv = paxos_verify_leader(token, disk, &leader, checksum, "read_resource_owners"); if (rv < 0) goto out; res->lver = leader.lver; if (leader.timestamp && leader.owner_id) host_count++; for (i = 0; i < leader.num_hosts; i++) { lease_buf_dblock = lease_buf + ((2 + i) * token->sector_size); mb_end = (struct mode_block *)(lease_buf_dblock + MBLOCK_OFFSET); mode_block_in(mb_end, &mb); host_id = i + 1; if (!(mb.flags & MBLOCK_SHARED)) continue; 
res->flags |= SANLK_RES_SHARED; /* the leader owner has already been counted above; in the ex case it won't have a mode block set */ if (leader.timestamp && leader.owner_id && (host_id == leader.owner_id)) continue; host_count++; } *count = host_count; if (!host_count) { rv = 0; goto out; } hosts_buf = malloc(host_count * sizeof(struct sanlk_host)); if (!hosts_buf) { host_count = 0; rv = -ENOMEM; goto out; } memset(hosts_buf, 0, host_count * sizeof(struct sanlk_host)); host = (struct sanlk_host *)hosts_buf; /* * Usually when leader owner is set, it's an exclusive lock and * we could skip to the end, but if we read while a new shared * owner is being added, we'll see the leader owner set, and * then may see other shared owners in the mode blocks. */ if (leader.timestamp && leader.owner_id) { host->host_id = leader.owner_id; host->generation = leader.owner_generation; host->timestamp = leader.timestamp; host++; } for (i = 0; i < leader.num_hosts; i++) { lease_buf_dblock = lease_buf + ((2 + i) * token->sector_size); mb_end = (struct mode_block *)(lease_buf_dblock + MBLOCK_OFFSET); mode_block_in(mb_end, &mb); host_id = i + 1; if (!(mb.flags & MBLOCK_SHARED)) continue; if (leader.timestamp && leader.owner_id && (host_id == leader.owner_id)) continue; host->host_id = host_id; host->generation = mb.generation; host++; } rv = 0; out: *send_len = host_count * sizeof(struct sanlk_host); *send_buf = hosts_buf; free(lease_buf); return rv; } /* return 1 (is alive) to force a failure if we don't have enough knowledge to know it's really not alive. 
Later we could have this sit and wait (like paxos_lease_acquire) until we have waited long enough or have enough knowledge to say it's safely dead (unless of course we find it is alive while waiting) */ static int host_live(char *lockspace_name, uint32_t space_id, uint64_t host_id, uint64_t gen) { struct host_status hs; uint64_t now; int other_io_timeout, other_host_dead_seconds; int rv; rv = host_info(lockspace_name, host_id, &hs); if (rv) { log_sid(space_id, "host_live %llu %llu yes host_info %d", (unsigned long long)host_id, (unsigned long long)gen, rv); return 1; } if (!hs.last_check) { log_sid(space_id, "host_live %llu %llu yes unchecked", (unsigned long long)host_id, (unsigned long long)gen); return 1; } /* the host_id lease is free, not being used */ if (!hs.timestamp) { log_sid(space_id, "host_live %llu %llu no lease free", (unsigned long long)host_id, (unsigned long long)gen); return 0; } if (hs.owner_generation > gen) { log_sid(space_id, "host_live %llu %llu no old gen %llu", (unsigned long long)host_id, (unsigned long long)gen, (unsigned long long)hs.owner_generation); return 0; } now = monotime(); other_io_timeout = hs.io_timeout; other_host_dead_seconds = calc_host_dead_seconds(other_io_timeout); if (!hs.last_live && (now - hs.first_check > other_host_dead_seconds)) { log_sid(space_id, "host_live %llu %llu no first_check %llu", (unsigned long long)host_id, (unsigned long long)gen, (unsigned long long)hs.first_check); return 0; } if (hs.last_live && (now - hs.last_live > other_host_dead_seconds)) { log_sid(space_id, "host_live %llu %llu no last_live %llu", (unsigned long long)host_id, (unsigned long long)gen, (unsigned long long)hs.last_live); return 0; } log_sid(space_id, "host_live %llu %llu yes recent first_check %llu last_live %llu", (unsigned long long)host_id, (unsigned long long)gen, (unsigned long long)hs.first_check, (unsigned long long)hs.last_live); return 1; } void check_mode_block(struct token *token, uint64_t next_lver, int q, char 
*dblock_buf) { struct mode_block *mb_end; struct mode_block mb; mb_end = (struct mode_block *)(dblock_buf + MBLOCK_OFFSET); mode_block_in(mb_end, &mb); if (mb.flags & MBLOCK_SHARED) { set_id_bit(q + 1, token->shared_bitmap, NULL); token->shared_count++; log_token(token, "ballot %llu mode[%d] shared %d gen %llu", (unsigned long long)next_lver, q, token->shared_count, (unsigned long long)mb.generation); } } static int write_host_block(struct task *task, struct token *token, uint64_t host_id, uint64_t mb_gen, uint32_t mb_flags, struct paxos_dblock *pd) { struct sync_disk *disk; struct mode_block mb; struct mode_block mb_end; struct paxos_dblock pd_end; char *iobuf, **p_iobuf; uint64_t offset; uint32_t checksum; int num_disks = token->r.num_disks; int iobuf_len, rv, d; disk = &token->disks[0]; iobuf_len = token->sector_size; if (!iobuf_len) return -EINVAL; p_iobuf = &iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) return -ENOMEM; memset(iobuf, 0, iobuf_len); /* * When writing our mode block, we need to keep our dblock * values intact because other hosts may be running the * paxos algorithm and these values need to remain intact * for them to reach the correct result. * * Be very careful that the latest/correct copy of our * dblock values are being used here. A paxos ballot * can get confused/stuck if we write the wrong dblock * values. */ if (pd) { if (pd->inp && (pd->inp != token->host_id)) { /* This should never happen, sanity check. 
*/ log_errot(token, "Ignore bad dblock while writing mblock %llu:%llu:%llu:%llu", (unsigned long long)pd->inp, (unsigned long long)pd->inp2, (unsigned long long)pd->inp3, (unsigned long long)pd->lver); memset(pd, 0, sizeof(struct paxos_dblock)); } else { paxos_dblock_out(pd, &pd_end); } checksum = dblock_checksum(&pd_end); pd->checksum = checksum; pd_end.checksum = cpu_to_le32(checksum); memcpy(iobuf, (char *)&pd_end, sizeof(struct paxos_dblock)); } if (mb_gen || mb_flags) { memset(&mb, 0, sizeof(mb)); mb.flags = mb_flags; mb.generation = mb_gen; mode_block_out(&mb, &mb_end); memcpy(iobuf + MBLOCK_OFFSET, &mb_end, sizeof(struct mode_block)); } for (d = 0; d < num_disks; d++) { disk = &token->disks[d]; offset = disk->offset + ((2 + host_id - 1) * token->sector_size); rv = write_iobuf(disk->fd, offset, iobuf, iobuf_len, task, token->io_timeout, NULL); if (rv < 0) break; } if (rv < 0) { log_errot(token, "write_host_block host_id %llu flags %x gen %llu rv %d", (unsigned long long)host_id, mb_flags, (unsigned long long)mb_gen, rv); } else { if (pd) log_token(token, "write_host_block host_id %llu flags %x gen %llu dblock %llu:%llu:%llu:%llu:%llu:%llu%s", (unsigned long long)host_id, mb_flags, (unsigned long long)mb_gen, (unsigned long long)pd->mbal, (unsigned long long)pd->bal, (unsigned long long)pd->inp, (unsigned long long)pd->inp2, (unsigned long long)pd->inp3, (unsigned long long)pd->lver, (pd->flags & DBLOCK_FL_RELEASED) ? ":RELEASED." 
: "."); else log_token(token, "write_host_block host_id %llu flags %x gen %llu dblock 0", (unsigned long long)host_id, mb_flags, (unsigned long long)mb_gen); } if (rv != SANLK_AIO_TIMEOUT) free(iobuf); return rv; } static int write_mblock_zero_dblock_release(struct task *task, struct token *token) { struct paxos_dblock dblock; memcpy(&dblock, &token->resource->dblock, sizeof(dblock)); dblock.flags = DBLOCK_FL_RELEASED; return write_host_block(task, token, token->host_id, 0, 0, &dblock); } static int write_mblock_shared_dblock_release(struct task *task, struct token *token) { struct paxos_dblock dblock; memcpy(&dblock, &token->resource->dblock, sizeof(dblock)); dblock.flags = DBLOCK_FL_RELEASED; return write_host_block(task, token, token->host_id, token->host_generation, MBLOCK_SHARED, &dblock); } static int read_mode_block(struct task *task, struct token *token, uint64_t host_id, struct mode_block *mb_out) { struct sync_disk *disk; struct mode_block *mb_end; struct mode_block mb; char *iobuf, **p_iobuf; uint64_t offset; int num_disks = token->r.num_disks; int iobuf_len, rv, d; disk = &token->disks[0]; iobuf_len = token->sector_size; if (!iobuf_len) return -EINVAL; p_iobuf = &iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) return -ENOMEM; for (d = 0; d < num_disks; d++) { disk = &token->disks[d]; offset = disk->offset + ((2 + host_id - 1) * token->sector_size); rv = read_iobuf(disk->fd, offset, iobuf, iobuf_len, task, token->io_timeout, NULL); if (rv < 0) break; mb_end = (struct mode_block *)(iobuf + MBLOCK_OFFSET); mode_block_in(mb_end, &mb); memcpy(mb_out, &mb, sizeof(struct mode_block)); /* FIXME: combine results for multi-disk case */ break; } if (rv != SANLK_AIO_TIMEOUT) free(iobuf); return rv; } static int clear_dead_shared(struct task *task, struct token *token, int num_hosts, int *live_count) { struct mode_block mb; uint64_t host_id; int i, rv = 0, live = 0; for (i = 0; i < num_hosts; i++) { host_id = i + 1; if (host_id == 
token->host_id) continue; if (!test_id_bit(host_id, token->shared_bitmap)) continue; memset(&mb, 0, sizeof(mb)); rv = read_mode_block(task, token, host_id, &mb); if (rv < 0) { log_errot(token, "clear_dead_shared read_mode_block %llu %d", (unsigned long long)host_id, rv); return rv; } log_token(token, "clear_dead_shared host_id %llu mode_block: flags %x gen %llu", (unsigned long long)host_id, mb.flags, (unsigned long long)mb.generation); /* * We get to this function because we saw the shared flag during * paxos, but the holder of the shared lease may have dropped their * shared lease and cleared the mode_block since then. */ if (!(mb.flags & MBLOCK_SHARED)) continue; if (!mb.generation) { /* shouldn't happen; if the shared flag is set, the generation should also be set. */ log_errot(token, "clear_dead_shared host_id %llu mode_block: flags %x gen %llu", (unsigned long long)host_id, mb.flags, (unsigned long long)mb.generation); continue; } if (host_live(token->r.lockspace_name, token->space_id, host_id, mb.generation)) { log_token(token, "clear_dead_shared host_id %llu gen %llu alive", (unsigned long long)host_id, (unsigned long long)mb.generation); live++; continue; } rv = write_host_block(task, token, host_id, 0, 0, NULL); if (rv < 0) { log_errot(token, "clear_dead_shared host_id %llu write_host_block %d", (unsigned long long)host_id, rv); return rv; } /* * not an error, just useful to have a record of when we clear a shared * lock that was left by a failed host. */ log_errot(token, "cleared shared lease for dead host_id %llu gen %llu", (unsigned long long)host_id, (unsigned long long)mb.generation); } *live_count = live; return rv; } /* the lvb is the sector after the dblock for host_id 2000, i.e. 
2002 */ #define LVB_SECTOR 2002 static int read_lvb_block(struct task *task, struct token *token) { struct sync_disk *disk; struct resource *r; char *iobuf; uint64_t offset; int iobuf_len, rv; r = token->resource; disk = &token->disks[0]; iobuf_len = token->sector_size; iobuf = r->lvb; offset = disk->offset + (LVB_SECTOR * token->sector_size); if (!r->lvb) return 0; rv = read_iobuf(disk->fd, offset, iobuf, iobuf_len, task, token->io_timeout, NULL); return rv; } static int write_lvb_block(struct task *task, struct resource *r, struct token *token) { struct sync_disk *disk; char *iobuf; uint64_t offset; int iobuf_len, rv; disk = &token->disks[0]; iobuf_len = token->sector_size; iobuf = r->lvb; offset = disk->offset + (LVB_SECTOR * token->sector_size); if (!r->lvb) return 0; rv = write_iobuf(disk->fd, offset, iobuf, iobuf_len, task, token->io_timeout, NULL); return rv; } int res_set_lvb(struct sanlk_resource *res, char *lvb, int lvblen) { struct resource *r; int rv = -ENOENT; pthread_mutex_lock(&resource_mutex); list_for_each_entry(r, &resources_held, list) { if (strncmp(r->r.lockspace_name, res->lockspace_name, NAME_ID_SIZE)) continue; if (strncmp(r->r.name, res->name, NAME_ID_SIZE)) continue; if (!r->lvb) { rv = -EINVAL; break; } if (lvblen > r->leader.sector_size) { rv = -E2BIG; break; } memcpy(r->lvb, lvb, lvblen); r->flags |= R_LVB_WRITE_RELEASE; rv = 0; break; } pthread_mutex_unlock(&resource_mutex); return rv; } int res_get_lvb(struct sanlk_resource *res, char **lvb_out, int *lvblen) { struct resource *r; char *lvb; int rv = -ENOENT; int len = *lvblen; pthread_mutex_lock(&resource_mutex); list_for_each_entry(r, &resources_held, list) { if (strncmp(r->r.lockspace_name, res->lockspace_name, NAME_ID_SIZE)) continue; if (strncmp(r->r.name, res->name, NAME_ID_SIZE)) continue; if (!r->lvb) { rv = -EINVAL; break; } if (!len) len = r->leader.sector_size; lvb = malloc(len); if (!lvb) { rv = -ENOMEM; break; } memcpy(lvb, r->lvb, len); *lvb_out = lvb; *lvblen = len; rv = 
0; break; } pthread_mutex_unlock(&resource_mutex); return rv; } /* return < 0 on error, 1 on success */ static int acquire_disk(struct task *task, struct token *token, uint64_t acquire_lver, int new_num_hosts, int owner_nowait, struct leader_record *leader, struct paxos_dblock *dblock) { struct leader_record leader_tmp; int rv; uint32_t flags = 0; if (com.quiet_fail) flags |= PAXOS_ACQUIRE_QUIET_FAIL; if (com.paxos_debug_all) flags |= PAXOS_ACQUIRE_DEBUG_ALL; if (token->acquire_flags & SANLK_RES_SHARED) flags |= PAXOS_ACQUIRE_SHARED; if (owner_nowait) flags |= PAXOS_ACQUIRE_OWNER_NOWAIT; memset(&leader_tmp, 0, sizeof(leader_tmp)); rv = paxos_lease_acquire(task, token, flags, &leader_tmp, dblock, acquire_lver, new_num_hosts); log_token(token, "acquire_disk rv %d lver %llu at %llu", rv, (unsigned long long)leader_tmp.lver, (unsigned long long)leader_tmp.timestamp); memcpy(leader, &leader_tmp, sizeof(struct leader_record)); return rv; /* SANLK_RV */ } /* return < 0 on error, 1 on success */ static int release_disk(struct task *task, struct token *token, struct sanlk_resource *resrename, struct leader_record *leader) { struct leader_record leader_tmp; int rv; rv = paxos_lease_release(task, token, resrename, leader, &leader_tmp); /* log_token(token, "release_disk rv %d", rv); */ if (rv < 0) return rv; memcpy(leader, &leader_tmp, sizeof(struct leader_record)); return rv; /* SANLK_OK */ } /* * This function will: * 1. list_del token from the struct resource (caller frees struct token) * 2. perform on-disk operations to remove this host's ownership of the lease * 3. list_del and free the struct resource * * Normal cases: * * 1. release ex lease * * . zero our dblock values [see *] * (zeroing our mblock at the same time is ok because it's not used) * . Use paxos_lease_release to set LEASE_FREE in leader_record. * . (If r->leader is zero, it implies that the on-disk lease was never * acquired, so all on-disk operations are skipped.) * * 2. 
release sh lease (R_SHARED is set in r_flags) * * . As a shared lease holder we do not own the leader, so no * change to the leader is needed. * . zero our mblock values (our SHARED flag) * (zeroing our dblock at the same time is ok because it's not used) * * Unusual cases: * * 3. skip all disk operations * * . "nodisk" is used when the caller only needs to remove the token (step 1), * i.e. on an error path prior to any disk operations having been started. * * . the token is being released because the lockspace is failed/dead, * so disk operations are skipped since they'll fail. * * . the token is being released after acquiring the lease failed, * e.g. it was owned by another host. * * 4. try to unwind from failed acquire of a shared lease (R_UNDO_SHARED) * * . A disk operation failed while trying to acquire a shared lease, * so we want to back out and leave the lease unowned. This means * ensuring that our mblock does not have SHARED set and that we * don't own the leader. * . zero our mblock values * . zero our dblock values [see *] * . Use paxos_lease_release to set LEASE_FREE in leader_record. * * 5. try to unwind from failed acquire (R_ERASE_ALL) * * . A disk operation failed at some point while changing a lease, * and we want to clear all ownership/state we have in the lease. * . zero our mblock values * . zero our dblock values [see * and **] * . Use paxos_lease_release to set LEASE_FREE in leader_record. * * (4 and 5 are basically the same and should be combined) * * * Error handling: * * If any on-disk i/o operation times out in step 2, then the struct resource * is moved to the resource_thread for retrying and step 3 is deferred. * The resource_thread will retry the on-disk operations until they succeed, * then free the resource. 
* * [*] Reason for clearing our dblock when releasing an ex/owned lease: * If we are releasing this lease very quickly after acquiring it, * there's a chance that another host was running the same acquire * ballot that we were and also committed us as the owner of this * lease, writing our inp values to the leader after we did ourself. * That leader write from the other host may happen after the leader * write we will do here releasing ownership. So the release we do * here may be clobbered and lost. The result is that we own the lease * on disk, but don't know it, so it won't be released unless we happen * to acquire and release it again. The solution is that we clear our * dblock in addition to clearing the leader record. Other hosts can * then check our dblock to see if we really do own the lease. If the * leader says we own the lease, but our dblock is cleared, then our * leader write in release was clobbered, and other hosts will run a * ballot to set a new owner. * UPDATE to above: we no longer clear our dblock values because that * can interfere with other hosts running a paxos ballot at the same time, * instead we now set the DBLOCK_FL_RELEASED flag in our dblock, leaving our * other dblock values intact, and other hosts look for this flag to indicate * that we have released. * * [**] For ERASE_ALL we don't want another host running the ballot to select * our dblock values and commit them, making us the owner after we've aborted * the acquire. So, we clear our dblock values first to prevent that from * happening from this point forward. However, another host contending for the * lease at the same time we failed, could already have read our dblock values * from before we cleared them. In the worst case, that host could commit our * dblock values as the new leader, and that new leader write could appear on * disk up to host_dead_seconds later.
So it seems that technically we would
 * need to monitor the leader for up to host_dead_seconds after clearing our
 * dblock to check if we become the on-disk owner of the lease.  The chances
 * of all this happening seem so remote that we don't do this monitoring.
 * The best approach to dealing with the ERASE_ALL case is to run a full ballot
 * again, to ensure there's a known owner, and then release normally from that
 * state.  We don't attempt to queue up an another async ballot in the error
 * path either because it would get fairly complicated.  If the caller wants
 * to be extra sure that these obscure cases do not leave an orphaned lease
 * on disk, it can either:
 * - repeat the acquire call until it does not fail with a timeout, i.e.
 *   rerun the ballot until there's a known owner
 * - leave and rejoin the lockspace after an acquire times out, which will
 *   invalidate any on-disk lease state
 */

/*
 * Common implementation behind release_token(), release_token_opened()
 * and release_token_nodisk().
 *
 * resrename: passed through to release_disk() when the leader is released.
 * opened:    non-zero when the caller already opened the token's disks,
 *            so open_disks_fd() is skipped.
 * nodisk:    non-zero to skip every on-disk operation.
 *
 * Returns SANLK_OK, SANLK_ERROR (an exclusive token that is not the last
 * token of its resource), the error of a failed disk operation, or
 * SANLK_AIO_TIMEOUT.  On SANLK_AIO_TIMEOUT the resource is not freed: it
 * stays on resources_rem with R_THREAD_RELEASE set.
 */
static int _release_token(struct task *task, struct token *token,
			  struct sanlk_resource *resrename,
			  int opened, int nodisk)
{
	struct leader_record leader;
	struct resource *r = token->resource;
	uint64_t lver;
	uint32_t r_flags = 0;
	int retry_async = 0;
	int last_token = 0;
	int ret = SANLK_OK;
	int rv;

	/* We keep r on the resources_rem list while doing the actual release
	   on disk so another acquire for the same resource will see it on
	   the list and fail. we can't have one thread releasing and another
	   acquiring the same resource.  While on the rem list, the resource
	   can't be used by anyone. */

	pthread_mutex_lock(&resource_mutex);
	list_del(&token->list);
	if (list_empty(&r->tokens)) {
		list_move(&r->list, &resources_rem);
		last_token = 1;
	}
	/* snapshot under the mutex; the decisions below use these copies */
	lver = r->leader.lver;
	r_flags = r->flags;
	pthread_mutex_unlock(&resource_mutex);

	if ((r_flags & R_SHARED) && !last_token) {
		/* will release when final sh token is released */
		log_token(token, "release_token more shared");
		close_disks(token->disks, token->r.num_disks);
		return SANLK_OK;
	}

	if (!last_token) {
		/* should never happen */
		log_errot(token, "release_token exclusive not last");
		close_disks(token->disks, token->r.num_disks);
		return SANLK_ERROR;
	}

	if (token->space_dead) {
		/* don't bother trying disk op which will probably timeout */
		close_disks(token->disks, token->r.num_disks);
		goto out;
	}

	if (nodisk)
		goto out;

	if (!opened) {
		rv = open_disks_fd(token->disks, token->r.num_disks);
		if (rv < 0) {
			log_errot(token, "release_token open error %d", rv);
			ret = rv;
			goto out;
		}
	}

	log_token(token, "release_token r_flags %x lver %llu",
		  r_flags, (unsigned long long)lver);

	/*
	 * In all cases we want to (or can) clear both dblock and mblock.
	 *
	 * Cases where we want to release ownership of the leader:
	 * . releasing ex lease !(r_flags & R_SHARED)
	 * . R_UNDO_SHARED
	 * . R_ERASE_ALL
	 *
	 * Cases where we don't want to release ownership of the leader:
	 * . releasing sh lease: (r_flags & R_SHARED)
	 */

	if (r_flags & R_ERASE_ALL) {
		rv = write_mblock_zero_dblock_release(task, token);
		if (rv < 0) {
			log_errot(token, "release_token erase all write_host_block %d", rv);
			ret = rv;
		}
		if (rv == SANLK_AIO_TIMEOUT)
			retry_async = 1;

		/* Even when acquire did not get far enough to get a copy of the
		   leader (!lver), we still want to try to release the leader
		   in case we own it from another host committing our dblock. */

		if (!lver)
			rv = paxos_lease_release(task, token, NULL, NULL, &leader);
		else
			rv = paxos_lease_release(task, token, NULL, &r->leader, &leader);
		if (rv < 0)
			ret = rv;

		if (rv == SANLK_AIO_TIMEOUT)
			retry_async = 1;

		/* want to see this result in sanlock.log but not worry people with error */
		log_warnt(token, "release_token erase all leader lver %llu rv %d",
			  (unsigned long long)lver, rv);

	} else if (r_flags & R_UNDO_SHARED) {
		rv = write_mblock_zero_dblock_release(task, token);
		if (rv < 0) {
			log_errot(token, "release_token undo shared write_host_block %d", rv);
			ret = rv;
		}
		if (rv == SANLK_AIO_TIMEOUT)
			retry_async = 1;

		rv = release_disk(task, token, resrename, &r->leader);
		if (rv < 0) {
			log_errot(token, "release_token undo shared release leader %d", rv);
			ret = rv;
		}
		if (rv == SANLK_AIO_TIMEOUT)
			retry_async = 1;

	} else if (r_flags & R_SHARED) {
		/* normal release of sh lease */

		rv = write_mblock_zero_dblock_release(task, token);
		if (rv < 0) {
			log_errot(token, "release_token shared write_host_block %d", rv);
			ret = rv;
		}
		if (rv == SANLK_AIO_TIMEOUT)
			retry_async = 1;

	} else {
		/* normal release of ex lease */

		if (!lver) {
			/* zero lver means acquire did not get to the point of writing a leader,
			   so we don't need to release the lease on disk. */
			close_disks(token->disks, token->r.num_disks);
			ret = SANLK_OK;
			goto out;
		}

		if (r_flags & R_LVB_WRITE_RELEASE) {
			rv = write_lvb_block(task, r, token);
			if (!rv)
				r->flags &= ~R_LVB_WRITE_RELEASE;
			else
				log_errot(token, "release_token write_lvb error %d", rv);
			/* do we want to give more effort to writing lvb? */
		}

		/* Failure here is not a big deal and can be ignored. */
		/* (in this branch a failed host block write changes neither ret nor retry_async) */
		rv = write_mblock_zero_dblock_release(task, token);
		if (rv < 0)
			log_errot(token, "release_token write_host_block %d", rv);

		rv = release_disk(task, token, resrename, &r->leader);
		if (rv < 0) {
			log_errot(token, "release_token release leader %d", rv);
			ret = rv;
		}

		if (rv == SANLK_AIO_TIMEOUT)
			retry_async = 1;
	}

	close_disks(token->disks, token->r.num_disks);

 out:
	if (!retry_async) {
		if (ret != SANLK_OK)
			log_token(token, "release_token error %d r_flags %x", ret, r_flags);
		else
			log_token(token, "release_token done r_flags %x", r_flags);

		/* no retry is pending: drop the resource from its list and free it */
		pthread_mutex_lock(&resource_mutex);
		list_del(&r->list);
		free_resource(r);
		pthread_mutex_unlock(&resource_mutex);
		return ret;
	}

	/*
	 * If a transient i/o error prevented the release on disk,
	 * then handle this like an async release; set R_THREAD_RELEASE,
	 * leave r on resources_rem, let resource_thread_release attempt
	 * to release it.  We don't want to leave the lease locked on
	 * disk, preventing others from acquiring it.
	 */

	log_errot(token, "release_token timeout r_flags %x", r_flags);
	pthread_mutex_lock(&resource_mutex);
	r->flags |= R_THREAD_RELEASE;
	pthread_mutex_unlock(&resource_mutex);
	return SANLK_AIO_TIMEOUT;
}

/* Release the token without any on-disk operation (nodisk = 1). */
static int release_token_nodisk(struct task *task, struct token *token)
{
	return _release_token(task, token, NULL, 0, 1);
}

/* Release the token when its disks are already open (opened = 1). */
static int release_token_opened(struct task *task, struct token *token)
{
	return _release_token(task, token, NULL, 1, 0);
}

/* Release the token; the disks are opened here and resrename is passed through. */
int release_token(struct task *task, struct token *token,
		  struct sanlk_resource *resrename)
{
	return _release_token(task, token, resrename, 0, 0);
}

/* We're releasing a token from the main thread, in which we don't want to block,
   so we can't do a real release involving disk io. So, pass the release off to
   the resource_thread.
*/ void release_token_async(struct token *token) { struct resource *r = token->resource; pthread_mutex_lock(&resource_mutex); list_del(&token->list); if (list_empty(&r->tokens)) { if (token->space_dead || !r->leader.lver) { /* don't bother trying to release if the lockspace is dead (release will probably fail), or the lease was never acquired */ list_del(&r->list); free_resource(r); } else if (token->acquire_flags & SANLK_RES_PERSISTENT) { list_move(&r->list, &resources_orphan); } else { r->flags |= R_THREAD_RELEASE; resource_thread_work = 1; list_move(&r->list, &resources_rem); pthread_cond_signal(&resource_cond); } } pthread_mutex_unlock(&resource_mutex); } static struct resource *find_resource(struct token *token, struct list_head *head) { struct resource *r; list_for_each_entry(r, head, list) { if (strncmp(r->r.lockspace_name, token->r.lockspace_name, NAME_ID_SIZE)) continue; if (strncmp(r->r.name, token->r.name, NAME_ID_SIZE)) continue; return r; } return NULL; } /* * Determines if lockspace is "used" for the purpose of * rem_lockspace(REM_UNUSED). 
 */

/*
 * Returns 1 if any resource on the held, add, rem or orphan list belongs
 * to the lockspace named ls->name, 0 otherwise.  Takes resource_mutex.
 */
int lockspace_is_used(struct sanlk_lockspace *ls)
{
	struct resource *r;

	pthread_mutex_lock(&resource_mutex);
	list_for_each_entry(r, &resources_held, list) {
		if (!strncmp(r->r.lockspace_name, ls->name, NAME_ID_SIZE))
			goto yes;
	}
	list_for_each_entry(r, &resources_add, list) {
		if (!strncmp(r->r.lockspace_name, ls->name, NAME_ID_SIZE))
			goto yes;
	}
	list_for_each_entry(r, &resources_rem, list) {
		if (!strncmp(r->r.lockspace_name, ls->name, NAME_ID_SIZE))
			goto yes;
	}
	list_for_each_entry(r, &resources_orphan, list) {
		if (!strncmp(r->r.lockspace_name, ls->name, NAME_ID_SIZE))
			goto yes;
	}
	pthread_mutex_unlock(&resource_mutex);
	return 0;
 yes:
	pthread_mutex_unlock(&resource_mutex);
	return 1;
}

/*
 * Returns the number of resources on the orphan list that belong to the
 * lockspace named space_name.  Takes resource_mutex.
 */
int resource_orphan_count(char *space_name)
{
	struct resource *r;
	int count = 0;

	pthread_mutex_lock(&resource_mutex);
	list_for_each_entry(r, &resources_orphan, list) {
		if (!strncmp(r->r.lockspace_name, space_name, NAME_ID_SIZE))
			count++;
	}
	pthread_mutex_unlock(&resource_mutex);
	return count;
}

/*
 * Copy path, offset and sector_size of num_disks sync_disk entries from
 * src to dst.  The fd of every destination entry is set to -1.
 */
static void copy_disks(void *dst, void *src, int num_disks)
{
	struct sync_disk *d, *s;
	int i;

	d = (struct sync_disk *)dst;
	s = (struct sync_disk *)src;

	for (i = 0; i < num_disks; i++) {
		memcpy(d->path, s->path, SANLK_PATH_LEN);
		d->offset = s->offset;
		d->sector_size = s->sector_size;

		/* fd's are private */
		d->fd = -1;

		d++;
		s++;
	}
}

/*
 * Return a zeroed struct resource initialized from the token, taken from
 * get_free_resource() when it provides one and from malloc() otherwise.
 *
 * When get_free_resource() reports that the free resource matches the
 * token, its res_id and reused values are kept and *new_id is set to 0.
 * In every other case a new res_id is taken from resource_id_counter and
 * *new_id is set to 1.
 *
 * Returns NULL if malloc() fails.
 */
static struct resource *get_resource(struct token *token, int *new_id)
{
	struct resource *r;
	int token_matches = 0;
	uint32_t res_id = 0;
	uint32_t reused = 0;
	int disks_len, r_len;

	/* the resource is sized to hold the token's disks after the struct itself */
	disks_len = token->r.num_disks * sizeof(struct sync_disk);
	r_len = sizeof(struct resource) + disks_len;

	r = get_free_resource(token, &token_matches);

	if (r && token_matches) {
		res_id = r->res_id;
		reused = r->reused;
		*new_id = 0;
	} else if (r) {
		res_id = resource_id_counter++;
		*new_id = 1;
	} else {
		r = malloc(r_len);
		if (!r)
			return NULL;
		res_id = resource_id_counter++;
		*new_id = 1;
	}

	memset(r, 0, r_len);

	/* preserved from one use to the next */
	r->res_id = res_id;
	r->reused = reused;

	memcpy(&r->r, &token->r, sizeof(struct sanlk_resource));

	r->io_timeout = token->io_timeout;

	/* disks copied after open_disks because open_disks sets sector_size
	   which we want copied */

	INIT_LIST_HEAD(&r->tokens);

	r->host_id = token->host_id;
	r->host_generation = token->host_generation;

	if (token->acquire_flags & SANLK_RES_SHARED) {
		r->flags |= R_SHARED;
	} else {
		/* pid and the signal restrictions are recorded for non-shared resources only */
		r->pid = token->pid;
		if (token->flags & T_RESTRICT_SIGKILL)
			r->flags |= R_RESTRICT_SIGKILL;
		if (token->flags & T_RESTRICT_SIGTERM)
			r->flags |= R_RESTRICT_SIGTERM;
	}

	return r;
}

/*
 * Convert the shared lease held through token into an exclusive lease.
 *
 * Runs paxos_lease_acquire() with T_WRITE_DBLOCK_MBLOCK_SH set on the
 * token, then uses clear_dead_shared() to check the other hosts that
 * showed a SHARED mode block.
 *
 * Returns SANLK_OK when the lease is now exclusive, -EAGAIN when another
 * live host holds the lease shared (the leader is released and the token
 * stays shared), or the error of the failing step.  On the "fail" path
 * the leader release is retried after up to three timeouts; if it still
 * fails, R_ERASE_ALL is set on the resource.
 */
static int convert_sh2ex_token(struct task *task, struct resource *r, struct token *token,
			       uint32_t cmd_flags)
{
	struct leader_record leader;
	struct paxos_dblock dblock;
	uint32_t flags = 0;
	int live_count = 0;
	int retries;
	int error;
	int rv;

	memset(&leader, 0, sizeof(leader));

	if (cmd_flags & SANLK_CONVERT_OWNER_NOWAIT)
		flags |= PAXOS_ACQUIRE_OWNER_NOWAIT;

	if (com.quiet_fail)
		flags |= PAXOS_ACQUIRE_QUIET_FAIL;

	if (com.paxos_debug_all)
		flags |= PAXOS_ACQUIRE_DEBUG_ALL;

	/* paxos_lease_acquire modifies these token values, and we check them after */
	token->shared_count = 0;
	memset(token->shared_bitmap, 0, HOSTID_BITMAP_SIZE);

	/* Using a token flag like this to manipulate the write_dblock to preserve our
	   mblock is ugly.  The diskio/paxos/resource layer separations are not quite
	   right, but would take some major effort to change.  The flag is needed to
	   prevent the ballot from clobbering our SHARED mblock.  Rewriting our mblock
	   after acquire isn't safe because if the paxos acquire doesn't succeed, then
	   we don't hold any lease for a time. */

	token->flags |= T_WRITE_DBLOCK_MBLOCK_SH;

	rv = paxos_lease_acquire(task, token, flags, &leader, &dblock, 0, 0);

	token->flags &= ~T_WRITE_DBLOCK_MBLOCK_SH;

	if (rv < 0) {
		log_token(token, "convert_sh2ex acquire error %d t_flags %x", rv, token->flags);

		/* If the acquire failed before anything important was written,
		   then this RETRACT flag will not be set, and there is nothing
		   to undo/cleanup; we can simply return an error.  Otherwise,
		   the acquire failed part way through, and we need to try to
		   clean up our state on disk.
		   Do on-disk release of owner.  Keep token and SH mblock. */

		if (token->flags & T_RETRACT_PAXOS) {
			token->flags &= ~T_RETRACT_PAXOS;
			error = rv;
			goto fail;
		}
		return rv;
	}

	memcpy(&r->leader, &leader, sizeof(struct leader_record));
	memcpy(&r->dblock, &dblock, sizeof(dblock));
	token->r.lver = leader.lver;

	/* paxos_lease_acquire set token->shared_count to the number of
	   SHARED mode blocks it found.  It should find at least 1 for
	   our own shared mode block. */

	log_token(token, "convert_sh2ex shared_count %d", token->shared_count);

	if (token->shared_count == 1)
		goto do_mb;

	if (!token->shared_count) {
		/* should never happen */
		log_errot(token, "convert_sh2ex zero shared_count");
		goto do_mb;
	}

	rv = clear_dead_shared(task, token, leader.num_hosts, &live_count);
	if (rv < 0) {
		log_errot(token, "convert_sh2ex clear_dead error %d", rv);
		/* Do on-disk release of owner.  Keep token and SH mblock. */
		error = rv;
		goto fail;
	}

	log_token(token, "convert_sh2ex live_count %d", live_count);

	if (live_count) {
		/*
		 * The convert fails because a live host with a sh lock exists.
		 * The token/lease is kept shared, the lease owner is released.
		 * Our SHARED mblock bit is still set on disk because
		 * T_WRITE_DBLOCK_MBLOCK_SH kept it set during acquire,
		 * so we only need to release the lease owner.
		 */

		rv = release_disk(task, token, NULL, &leader);
		if (rv < 0) {
			log_errot(token, "convert_sh2ex release_disk error %d", rv);
			/* Do on-disk release of owner.  Keep token and SH mblock. */
			error = rv;
			goto fail;
		}

		/* standard exit when convert fails due to other shared locks */
		return -EAGAIN;
	}

 do_mb:
	/* rewrite our host block with no mblock, which clears our SHARED flag on disk */
	rv = write_host_block(task, token, token->host_id, 0, 0, &dblock);
	if (rv < 0) {
		log_errot(token, "convert_sh2ex write_host_block error %d", rv);

		/* We have the ex lease, so return success.  We just failed to
		   clear our SH mblock.  When we later release this lease, the
		   release includes clearing the dblock/mblock, so there's not
		   really anything we need to do. */
	}

	/* TODO: clean up the duplication of stuff among: t, t->r, r, r->r */

	token->r.flags &= ~SANLK_RES_SHARED;
	token->acquire_flags &= ~SANLK_RES_SHARED;
	r->r.flags &= ~SANLK_RES_SHARED;
	r->flags &= ~R_SHARED;
	return SANLK_OK;

 fail:
	/*
	 * We want to fail and return an error to the caller while keeping
	 * the existing shared lease, and not being the ex owner.
	 *
	 * There's no easy way to pass off the undo of dblock/owner while
	 * keeping the lease token which still represents our sh lease, so
	 * we'll just retry here.  We don't want to retry forever, so there's
	 * an arbitrary limit.  If we reach the limit, we may want to pass back
	 * a new error to indicate that the lease may be in a non-standard
	 * state, e.g. both owner and mblock sh are set.  The caller will see
	 * the error, know that it still holds a sh lease, but the owner may be
	 * in limbo.  To clear the lease state, it should release the lease
	 * or leave/rejoin the lockspace.  We set ERASE_ALL on the resource
	 * here so that if/when the caller releases its lease (explicitly or
	 * implicitly by exit), the release_token will clear owner/dblock/mblock.
	 *
	 * As elsewhere, non-timeout errors during disk operations should not
	 * happen, are considered uncorrectable, are not retried, and the
	 * lockspace/leases should be considered invalid.
	 */

	if (token->space_dead)
		return error;

	retries = 0;
 retry:
	/* a zero lver means no leader was obtained, so none is passed as the last leader */
	rv = paxos_lease_release(task, token, NULL,
				 leader.lver ? &leader : NULL, &leader);
	if ((rv == SANLK_AIO_TIMEOUT) && (retries < 3)) {
		retries++;
		log_errot(token, "convert_sh2ex fail %d undo owner timeout", retries);
		sleep(token->io_timeout);
		goto retry;
	} else if (rv < 0) {
		log_errot(token, "convert_sh2ex fail %d undo owner error %d", retries, rv);
		r->flags |= R_ERASE_ALL;
		return error;
	}

	/* We've managed to release the owner, so the lease is in
	   a standard state with ourselves having a shared lease
	   and not holding the owner ex. */

	return error;
}

/*
 * Convert the exclusive lease held through token into a shared lease.
 *
 * Writes the lvb first if R_LVB_WRITE_RELEASE is set (the result of that
 * write is not checked), then writes our SHARED mblock and releases the
 * leader.  A release that times out is retried with a growing sleep while
 * fail_count is below token->io_timeout and the lockspace is not dead.
 *
 * Returns SANLK_OK on success or the error of the failing step; when the
 * leader release fails, R_ERASE_ALL is set on the resource.
 */
static int convert_ex2sh_token(struct task *task, struct resource *r, struct token *token)
{
	struct leader_record leader;
	int fail_count = 0;
	int rv;

	memcpy(&leader, &r->leader, sizeof(leader));

	if (r->flags & R_LVB_WRITE_RELEASE)
		write_lvb_block(task, r, token);

	rv = write_mblock_shared_dblock_release(task, token);
	if (rv < 0) {
		log_errot(token, "convert_ex2sh write_host_block error %d", rv);
		return rv;
	}

 retry:
	/* the token is kept, the paxos lease is released but with shared now set */
	rv = release_disk(task, token, NULL, &leader);
	if ((rv == SANLK_AIO_TIMEOUT) && (fail_count < token->io_timeout)) {
		log_errot(token, "convert_ex2sh release_disk timeout %d", fail_count);
		fail_count++;
		if (token->space_dead)
			return rv;
		sleep(fail_count);
		goto retry;
	} else if (rv < 0) {
		log_errot(token, "convert_ex2sh release_disk error %d", rv);

		/* We have sh, and possibly ex.  Given this uncertain state on
		   disk, we want release_token to ensure owner/dblock/mblock
		   are all cleared when the lease is released by the client
		   (either explicitly or implicitly when it exits).
		   ERASE_ALL will cause release_token to do this. */

		r->flags |= R_ERASE_ALL;
		return rv;
	}

	token->r.flags |= SANLK_RES_SHARED;
	token->acquire_flags |= SANLK_RES_SHARED;
	r->r.flags |= SANLK_RES_SHARED;
	r->flags |= R_SHARED;
	return SANLK_OK;
}

/*
 * Convert the lease held through cl_token to the mode requested in
 * res->flags: shared if SANLK_RES_SHARED is set, exclusive otherwise.
 *
 * Returns -ENOENT if the resource is not on resources_held or cl_token is
 * not one of its tokens, -EINVAL if the shared token count disagrees with
 * the resource's R_SHARED flag, -EALREADY if the lease is already in the
 * requested mode, the open_disks_fd() error if the disks cannot be
 * opened, otherwise the result of the sh2ex or ex2sh conversion.
 */
int convert_token(struct task *task, struct sanlk_resource *res, struct token *cl_token,
		  uint32_t cmd_flags)
{
	struct resource *r;
	struct token *tk;
	struct token *token = NULL;
	int sh_count = 0;
	int rv;

	/* we could probably grab cl_token->r, but it's good to verify */

	pthread_mutex_lock(&resource_mutex);
	r = find_resource(cl_token, &resources_held);
	if (!r) {
		pthread_mutex_unlock(&resource_mutex);
		log_error("convert_token resource not found %.48s:%.48s",
			  cl_token->r.lockspace_name, cl_token->r.name);
		rv = -ENOENT;
		goto out;
	}

	/* find existing token */

	list_for_each_entry(tk, &r->tokens, list) {
		if (tk == cl_token)
			token = tk;
		if (tk->acquire_flags & SANLK_RES_SHARED)
			sh_count++;
	}
	pthread_mutex_unlock(&resource_mutex);

	if (!token) {
		log_errot(cl_token, "convert_token token not found pid %d %.48s:%.48s",
			  cl_token->pid, cl_token->r.lockspace_name, cl_token->r.name);
		rv = -ENOENT;
		goto out;
	}

	if (sh_count && !(r->flags & R_SHARED)) {
		/* should not be possible */
		log_errot(token, "convert_token invalid sh_count %d flags %x",
			  sh_count, r->flags);
		rv = -EINVAL;
		goto out;
	}

	if (!sh_count && (r->flags & R_SHARED)) {
		/* should not be possible */
		log_errot(token, "convert_token invalid sh_count %d flags %x",
			  sh_count, r->flags);
		rv = -EINVAL;
		goto out;
	}

	/* already exclusive and exclusive requested */
	if (!(res->flags & SANLK_RES_SHARED) && !(r->flags & R_SHARED)) {
		rv = -EALREADY;
		goto out;
	}

	/* already shared and shared requested */
	if ((res->flags & SANLK_RES_SHARED) && (r->flags & R_SHARED)) {
		rv = -EALREADY;
		goto out;
	}

	rv = open_disks_fd(token->disks, token->r.num_disks);
	if (rv < 0) {
		log_errot(token, "convert_token open error %d", rv);
		goto out;
	}

	if (!(res->flags & SANLK_RES_SHARED)) {
		rv = convert_sh2ex_token(task, r, token, cmd_flags);
	} else if (res->flags & SANLK_RES_SHARED) {
		rv = convert_ex2sh_token(task, r, token);
	} else {
		/* not possible */
		rv = -EINVAL;
	}

	close_disks(token->disks, token->r.num_disks);
 out:
	return rv;
}

int acquire_token(struct task *task, struct token *token, uint32_t cmd_flags,
		  char *killpath, char *killargs)
{
	struct leader_record leader;
	struct paxos_dblock dblock;
	struct resource *r;
	uint64_t acquire_lver = 0;
	uint32_t new_num_hosts = 0;
	int sh_retries = 0;
	int live_count = 0;
	int allow_orphan = 0;
	int only_orphan = 0;
	int owner_nowait = 0;
	int new_id = 0;
	int rv;

	memset(&dblock, 0, sizeof(dblock));

	if (token->acquire_flags & SANLK_RES_LVER)
		acquire_lver = token->acquire_lver;
	if (token->acquire_flags & SANLK_RES_NUM_HOSTS)
		new_num_hosts = token->acquire_data32;

	if (cmd_flags & (SANLK_ACQUIRE_ORPHAN | SANLK_ACQUIRE_ORPHAN_ONLY))
		allow_orphan = 1;
	if (cmd_flags & SANLK_ACQUIRE_ORPHAN_ONLY)
		only_orphan = 1;
	if (cmd_flags & SANLK_ACQUIRE_OWNER_NOWAIT)
		owner_nowait = 1;

	pthread_mutex_lock(&resource_mutex);

	/*
	 * Check if this resource already exists on any of the resource lists.
	 */

	r = find_resource(token, &resources_rem);
	if (r) {
		token->res_id = r->res_id;
		if (!com.quiet_fail)
			log_errot(token, "acquire_token resource being removed");
		pthread_mutex_unlock(&resource_mutex);
		return -EAGAIN;
	}

	r = find_resource(token, &resources_add);
	if (r) {
		token->res_id = r->res_id;
		if (!com.quiet_fail)
			log_errot(token, "acquire_token resource being added");
		pthread_mutex_unlock(&resource_mutex);
		return -EBUSY;
	}

	r = find_resource(token, &resources_held);
	if (r && (token->acquire_flags & SANLK_RES_SHARED) && (r->flags & R_SHARED)) {
		/* multiple shared holders allowed */
		token->res_id = r->res_id;
		log_token(token, "acquire_token add shared");
		copy_disks(&token->r.disks, &r->r.disks, token->r.num_disks);
		token->resource = r;
		list_add(&token->list, &r->tokens);
		pthread_mutex_unlock(&resource_mutex);
		return SANLK_OK;
	}

	if (r) {
		token->res_id = r->res_id;
		if (!com.quiet_fail)
			log_errot(token, "acquire_token resource exists");
		pthread_mutex_unlock(&resource_mutex);
		return -EEXIST;
	}

	/* caller did not ask for orphan, but
an orphan exists */ r = find_resource(token, &resources_orphan); if (r && !allow_orphan) { token->res_id = r->res_id; log_errot(token, "acquire_token found orphan"); pthread_mutex_unlock(&resource_mutex); return -EUCLEAN; } /* caller asked for exclusive orphan, but a shared orphan exists */ if (r && allow_orphan && (r->flags & R_SHARED) && !(token->acquire_flags & SANLK_RES_SHARED)) { token->res_id = r->res_id; log_errot(token, "acquire_token orphan is shared"); pthread_mutex_unlock(&resource_mutex); return -EUCLEAN; } /* caller asked for a shared orphan, but an exclusive orphan exists */ if (r && allow_orphan && !(r->flags & R_SHARED) && (token->acquire_flags & SANLK_RES_SHARED)) { token->res_id = r->res_id; log_errot(token, "acquire_token orphan is exclusive"); pthread_mutex_unlock(&resource_mutex); return -EUCLEAN; } /* caller asked for shared orphan, and a shared orphan exists */ if (r && allow_orphan && (r->flags & R_SHARED) && (token->acquire_flags & SANLK_RES_SHARED)) { token->res_id = r->res_id; log_token(token, "acquire_token adopt shared orphan"); token->resource = r; list_add(&token->list, &r->tokens); list_move(&r->list, &resources_held); pthread_mutex_unlock(&resource_mutex); /* do this to initialize some token fields */ rv = open_disks(token->disks, token->r.num_disks); if (rv < 0) { /* TODO: what parts above need to be undone? 
*/ log_errot(token, "acquire_token sh orphan open error %d", rv); release_token_nodisk(task, token); return rv; } close_disks(token->disks, token->r.num_disks); return SANLK_OK; } /* caller asked for exclusive orphan, and an exclusive orphan exists */ if (r && allow_orphan && !(r->flags & R_SHARED) && !(token->acquire_flags & SANLK_RES_SHARED)) { token->res_id = r->res_id; log_token(token, "acquire_token adopt orphan"); token->r.lver = r->leader.lver; r->pid = token->pid; token->resource = r; list_add(&token->list, &r->tokens); list_move(&r->list, &resources_held); pthread_mutex_unlock(&resource_mutex); /* do this to initialize some token fields */ rv = open_disks(token->disks, token->r.num_disks); if (rv < 0) { /* TODO: what parts above need to be undone? */ log_errot(token, "acquire_token orphan open error %d", rv); release_token_nodisk(task, token); return rv; } close_disks(token->disks, token->r.num_disks); return SANLK_OK; } /* caller only wants to acquire an orphan */ if (cmd_flags & only_orphan) { pthread_mutex_unlock(&resource_mutex); return -ENOENT; } /* * The resource does not exist, so create it. 
*/ r = get_resource(token, &new_id); if (!r) { pthread_mutex_unlock(&resource_mutex); return -ENOMEM; } memcpy(r->killpath, killpath, SANLK_HELPER_PATH_LEN); memcpy(r->killargs, killargs, SANLK_HELPER_ARGS_LEN); list_add(&token->list, &r->tokens); list_add(&r->list, &resources_add); token->res_id = r->res_id; token->resource = r; pthread_mutex_unlock(&resource_mutex); if (new_id) { /* save a record of what this id is for later debugging */ log_warnt(token, "resource %.48s:%.48s:%.256s:%llu", token->r.lockspace_name, token->r.name, token->r.disks[0].path, (unsigned long long)token->r.disks[0].offset); } rv = open_disks(token->disks, token->r.num_disks); if (rv < 0) { log_errot(token, "acquire_token open error %d", rv); release_token_nodisk(task, token); return rv; } copy_disks(&r->r.disks, &token->r.disks, token->r.num_disks); retry: memset(&leader, 0, sizeof(struct leader_record)); rv = acquire_disk(task, token, acquire_lver, new_num_hosts, owner_nowait, &leader, &dblock); /* * token sector_size/align_size starts by using sector_size/align_size * from the ls, but can change in paxos acquire when we see what's in * the leader_record. */ r->sector_size = token->sector_size; r->align_size = token->align_size; if (rv == SANLK_ACQUIRE_IDLIVE || rv == SANLK_ACQUIRE_OWNED || rv == SANLK_ACQUIRE_OTHER) { /* * Another host owns the lease. They may be holding for * only a short time while getting a shared lease. * Multiple parallel sh requests can fail because * the lease is briefly held in ex mode. The ex * holder sets SHORT_HOLD in the leader record to * indicate that it's only held for a short time * while acquiring a shared lease. A retry will * probably succeed. 
*/ if ((token->acquire_flags & SANLK_RES_SHARED) && (leader.flags & LFL_SHORT_HOLD)) { if (sh_retries++ < com.sh_retries) { int us = get_rand(0, 1000000); log_token(token, "acquire_token sh_retry %d %d", rv, us); usleep(us); goto retry; } /* zero r->leader means not owned and release will just close */ release_token_opened(task, token); return SANLK_ACQUIRE_SHRETRY; } if (com.quiet_fail) log_token(token, "acquire_token held error %d", rv); else log_errot(token, "acquire_token held error %d", rv); /* zero r->leader means not owned and release will just close */ release_token_opened(task, token); return rv; } if (rv < 0 && !(token->flags & T_RETRACT_PAXOS)) { log_token(token, "acquire_token disk error %d", rv); r->flags &= ~R_SHARED; /* zero r->leader means not owned and release will just close */ release_token_opened(task, token); return rv; } if (rv < 0 && (token->flags & T_RETRACT_PAXOS)) { /* * We might own the lease, we don't know, so we need to try to * release on disk to avoid possibly having an orphan lease on disk. */ log_errot(token, "acquire_token disk error %d RETRACT_PAXOS", rv); r->flags &= ~R_SHARED; r->flags |= R_ERASE_ALL; memcpy(&r->leader, &leader, sizeof(struct leader_record)); release_token_opened(task, token); return rv; } memcpy(&r->leader, &leader, sizeof(struct leader_record)); memcpy(&r->dblock, &dblock, sizeof(dblock)); /* copy lver into token because inquire looks there for it */ if (!(token->acquire_flags & SANLK_RES_SHARED)) token->r.lver = leader.lver; /* * acquiring shared lease, so we set SHARED in our mode_block * and release the leader owner. 
*/ if (token->acquire_flags & SANLK_RES_SHARED) { rv = write_mblock_shared_dblock_release(task, token); if (rv < 0) { log_errot(token, "acquire_token sh write_host_block error %d", rv); r->flags &= ~R_SHARED; r->flags |= R_UNDO_SHARED; release_token_opened(task, token); return rv; } /* the token is kept, the paxos lease is released but with shared set */ rv = release_disk(task, token, NULL, &leader); if (rv < 0) { log_errot(token, "acquire_token sh release_disk error %d", rv); r->flags &= ~R_SHARED; r->flags |= R_UNDO_SHARED; release_token_opened(task, token); return rv; } /* normal exit case for successful acquire sh */ goto out; } /* * paxos_lease_acquire() calls check_mode_block() which increments * token->shared_count when it finds a mode block with SHARED set. * Zero shared_count means no one holds it shared, so we're done. * Normal exit case for successful acquire ex. */ if (!token->shared_count) { goto out; } /* * acquiring normal ex lease, other hosts have it shared. * check if those other hosts are alive or dead (clear any that are dead). */ /* * paxos_lease_acquire() counted some SHARED mode blocks. * Here we check if they are held by live hosts. If a host * with SHARED mb is dead, we clear it, otherwise it's alive * and we count it in live_count. */ rv = clear_dead_shared(task, token, leader.num_hosts, &live_count); if (rv < 0) { log_errot(token, "acquire_token clear_dead_shared error %d", rv); release_token_opened(task, token); return rv; } /* * acquiring normal ex lease, other hosts have it shared and are alive. * normal exit case for acquire ex that failed due to existing sh lock. */ if (live_count) { rv = release_token_opened(task, token); if (rv < 0) { log_errot(token, "acquire_token live_count release error %d", rv); return rv; } return -EAGAIN; } out: if (cmd_flags & SANLK_ACQUIRE_LVB) { char *iobuf, **p_iobuf; p_iobuf = &iobuf; /* TODO: we should probably notify the caller somehow about lvb read/write independent of the lease results. 
*/ rv = posix_memalign((void *)p_iobuf, getpagesize(), token->sector_size); if (rv) { log_errot(token, "acquire_token lvb size %d memalign error %d", token->sector_size, rv); } else { r->lvb = iobuf; rv = read_lvb_block(task, token); if (rv < 0) log_errot(token, "acquire_token read_lvb error %d", rv); } } close_disks(token->disks, token->r.num_disks); pthread_mutex_lock(&resource_mutex); list_move(&r->list, &resources_held); pthread_mutex_unlock(&resource_mutex); return SANLK_OK; } int request_token(struct task *task, struct token *token, uint32_t force_mode, uint64_t *owner_id, int next_lver) { struct leader_record leader; struct request_record req; int align_size; int rv; memset(&req, 0, sizeof(req)); rv = open_disks(token->disks, token->r.num_disks); if (rv < 0) { log_errot(token, "request_token open error %d", rv); return rv; } if (!token->acquire_lver && !force_mode) goto req_read; /* * cmd_request() takes the sector_size and align_size from the * lockspace as a starting point, setting them in the token. If the * leader_record for the paxos lease on disk is different, then adjust * the values in the token. 
*/ rv = paxos_lease_leader_read(task, token, &leader, "request"); if (rv < 0) goto out; align_size = leader_align_size_from_flag(leader.flags); if (!align_size) align_size = sector_size_to_align_size_old(leader.sector_size); if ((leader.sector_size != token->sector_size) || (align_size != token->align_size)) { /* paxos lease has different size than we borrowed from the lockspace */ token->sector_size = leader.sector_size; token->align_size = align_size; if (!token->align_size) token->align_size = sector_size_to_align_size_old(leader.sector_size); } if (leader.timestamp == LEASE_FREE) { *owner_id = 0; rv = SANLK_OK; goto out; } *owner_id = leader.owner_id; if (!token->acquire_lver && next_lver) token->acquire_lver = leader.lver + 1; if (leader.lver >= token->acquire_lver) { rv = SANLK_REQUEST_OLD; goto out; } req_read: rv = paxos_lease_request_read(task, token, &req); if (rv < 0) goto out; if (req.magic != REQ_DISK_MAGIC) { rv = SANLK_REQUEST_MAGIC; goto out; } if ((req.version & 0xFFFF0000) != REQ_DISK_VERSION_MAJOR) { rv = SANLK_REQUEST_VERSION; goto out; } if (!token->acquire_lver && !force_mode) goto req_write; /* > instead of >= so multiple hosts can request the same version at once and all succeed */ if (req.lver > token->acquire_lver) { rv = SANLK_REQUEST_LVER; goto out; } req_write: req.version = REQ_DISK_VERSION_MAJOR | REQ_DISK_VERSION_MINOR; req.lver = token->acquire_lver; req.force_mode = force_mode; rv = paxos_lease_request_write(task, token, &req); out: close_disks(token->disks, token->r.num_disks); log_token(token, "request_token rv %d owner %llu lver %llu mode %u", rv, (unsigned long long)*owner_id, (unsigned long long)req.lver, req.force_mode); return rv; } static int examine_token(struct task *task, struct token *token, struct request_record *req_out) { struct request_record req; int rv; memset(&req, 0, sizeof(req)); rv = paxos_lease_request_read(task, token, &req); if (rv < 0) goto out; if (req.magic != REQ_DISK_MAGIC) { rv = SANLK_REQUEST_MAGIC; 
goto out; } if ((req.version & 0xFFFF0000) != REQ_DISK_VERSION_MAJOR) { rv = SANLK_REQUEST_VERSION; goto out; } memcpy(req_out, &req, sizeof(struct request_record)); out: log_token(token, "examine_token rv %d lver %llu mode %u", rv, (unsigned long long)req.lver, req.force_mode); return rv; } static void do_request(struct token *tt, int pid, uint32_t force_mode) { char killpath[SANLK_HELPER_PATH_LEN]; char killargs[SANLK_HELPER_ARGS_LEN]; struct helper_msg hm; struct resource *r; uint32_t flags; int rv, found = 0; pthread_mutex_lock(&resource_mutex); r = find_resource(tt, &resources_held); if (r && r->pid == pid) { found = 1; flags = r->flags; memcpy(killpath, r->killpath, SANLK_HELPER_PATH_LEN); memcpy(killargs, r->killargs, SANLK_HELPER_ARGS_LEN); } pthread_mutex_unlock(&resource_mutex); if (!found) { log_error("do_request pid %d %.48s:%.48s not found", pid, tt->r.lockspace_name, tt->r.name); return; } log_token(tt, "do_request %d flags %x %.48s:%.48s", pid, flags, tt->r.lockspace_name, tt->r.name); if (helper_kill_fd == -1) { log_error("do_request %d no helper fd", pid); return; } memset(&hm, 0, sizeof(hm)); if (force_mode == SANLK_REQ_FORCE) { hm.type = HELPER_MSG_KILLPID; hm.pid = pid; hm.sig = (flags & R_RESTRICT_SIGKILL) ? SIGTERM : SIGKILL; } else if (force_mode == SANLK_REQ_GRACEFUL) { if (killpath[0]) { hm.type = HELPER_MSG_RUNPATH; memcpy(hm.path, killpath, SANLK_HELPER_PATH_LEN); memcpy(hm.args, killargs, SANLK_HELPER_ARGS_LEN); } else { hm.type = HELPER_MSG_KILLPID; hm.pid = pid; hm.sig = (flags & R_RESTRICT_SIGTERM) ? 
SIGKILL : SIGTERM; } } else { log_error("do_request %d unknown force_mode %d", pid, force_mode); return; } retry: rv = write(helper_kill_fd, &hm, sizeof(hm)); if (rv == -1 && errno == EINTR) goto retry; if (rv == -1) log_error("do_request %d helper write error %d", pid, errno); } int set_resource_examine(char *space_name, char *res_name) { struct resource *r; int count = 0; pthread_mutex_lock(&resource_mutex); list_for_each_entry(r, &resources_held, list) { if (strncmp(r->r.lockspace_name, space_name, NAME_ID_SIZE)) continue; if (res_name && strncmp(r->r.name, res_name, NAME_ID_SIZE)) continue; r->flags |= R_THREAD_EXAMINE; resource_thread_work = 1; resource_thread_work_examine = 1; count++; } if (count) pthread_cond_signal(&resource_cond); pthread_mutex_unlock(&resource_mutex); return count; } /* * resource_thread * - on-disk lease release for pid's that exit without doing release * - on-disk lease release for which release_token had transient i/o error * - examines request blocks of resources */ static struct resource *find_resource_thread(struct list_head *head, uint32_t flag) { struct resource *r; uint64_t now = monotime(); list_for_each_entry(r, head, list) { if (!(r->flags & flag)) continue; if (flag & R_THREAD_EXAMINE) return r; if (now >= r->thread_release_retry) return r; } return NULL; } /* * When release_token is called from a context where it cannot block by doing * disk io, the token itself is released, but the struct resource is passed to * the resource_thread to do the on-disk operations. * * Also, if release_token gets an io timeout during the disk operations, it * removes the token, but passes the struct resource to the resource_thread * to retry the on-disk release operations. It doesn't want to leave a * potentially locked lease on disk simply due to a transient io error. * * This does this non-token related on-disk release operations. It uses * a fake token emulating the original because the paxos layer wants that. 
* * As long as the on-disk release fails due to io timeouts, the struct resource * is kept and the on-disk release retried. If another, non-timeout error occurs, * we give up and delete/free the struct resource. */ static void resource_thread_release(struct task *task, struct resource *r, struct token *token) { struct leader_record leader; struct space_info spi; uint32_t r_flags; int retry_async = 0; int rv; r_flags = r->flags; rv = open_disks_fd(token->disks, token->r.num_disks); if (rv < 0) { log_errot(token, "release async open error %d", rv); goto out; } /* The lockspace may fail after the resource was transferred to the resource_thread, so we need to check here if if that's the case. */ rv = lockspace_info(token->r.lockspace_name, &spi); if (rv < 0 || spi.killing_pids) { log_token(token, "release async info %d %d", rv, spi.killing_pids); rv = -1; goto out_close; } /* * See comments in _release_token. * FIXME: avoid duplicating all this from _release_token. */ log_token(token, "release async r_flags %x", r_flags); if (r_flags & R_ERASE_ALL) { rv = write_mblock_zero_dblock_release(task, token); if (rv < 0) log_errot(token, "release async erase all write_host_block %d", rv); if (rv == SANLK_AIO_TIMEOUT) retry_async = 1; /* Even when acquire did not get far enough to get a copy of the leader (!lver), we still want to try to release the leader in case we own it from another host committing our dblock. 
*/ if (!r->leader.lver) rv = paxos_lease_release(task, token, NULL, NULL, &leader); else rv = paxos_lease_release(task, token, NULL, &r->leader, &leader); if (rv == SANLK_AIO_TIMEOUT) retry_async = 1; /* want to see this result in sanlock.log but not worry people with error */ log_warnt(token, "release async erase all leader lver %llu rv %d", (unsigned long long)r->leader.lver, rv); } else if (r_flags & R_UNDO_SHARED) { rv = write_mblock_zero_dblock_release(task, token); if (rv < 0) log_errot(token, "release async undo shared write_host_block %d", rv); if (rv == SANLK_AIO_TIMEOUT) retry_async = 1; rv = release_disk(task, token, NULL, &r->leader); if (rv < 0) log_errot(token, "release async undo shared release leader %d", rv); if (rv == SANLK_AIO_TIMEOUT) retry_async = 1; } else if (r_flags & R_SHARED) { /* normal release of sh lease */ rv = write_mblock_zero_dblock_release(task, token); if (rv < 0) log_errot(token, "release async shared write_host_block %d", rv); if (rv == SANLK_AIO_TIMEOUT) retry_async = 1; } else { /* normal release of ex lease */ if (r_flags & R_LVB_WRITE_RELEASE) { rv = write_lvb_block(task, r, token); if (!rv) r->flags &= ~R_LVB_WRITE_RELEASE; else log_errot(token, "release async write_lvb error %d", rv); /* do we want to give more effort to writing lvb? */ } /* Failure here is not a big deal and can be ignored. */ rv = write_mblock_zero_dblock_release(task, token); if (rv < 0) log_errot(token, "release async write_host_block %d", rv); rv = release_disk(task, token, NULL, &r->leader); if (rv < 0) log_errot(token, "release async release leader %d", rv); if (rv == SANLK_AIO_TIMEOUT) retry_async = 1; } out_close: close_disks(token->disks, token->r.num_disks); out: if (!retry_async) { log_token(token, "release async done r_flags %x", r_flags); pthread_mutex_lock(&resource_mutex); list_del(&r->list); free_resource(r); pthread_mutex_unlock(&resource_mutex); return; } /* Keep the resource on the list to keep trying. 
*/ log_token(token, "release async timeout r_flags %x", r_flags); pthread_mutex_lock(&resource_mutex); r->flags |= R_THREAD_RELEASE; pthread_mutex_unlock(&resource_mutex); } static void resource_thread_examine(struct task *task, struct token *tt, int pid, uint64_t lver) { struct request_record req; int rv; rv = open_disks_fd(tt->disks, tt->r.num_disks); if (rv < 0) { log_errot(tt, "examine open error %d", rv); return; } rv = examine_token(task, tt, &req); close_disks(tt->disks, tt->r.num_disks); if (rv != SANLK_OK) return; if (!req.force_mode || !req.lver) return; if (req.lver <= lver) { log_token(tt, "examine req lver %llu our lver %llu", (unsigned long long)req.lver, (unsigned long long)lver); return; } if (req.force_mode) { do_request(tt, pid, req.force_mode); } else { log_error("req force_mode %u unknown", req.force_mode); } } struct recv_he { struct list_head list; uint32_t space_id; uint64_t from_host_id; uint64_t from_generation; struct sanlk_host_event he; }; void add_host_event(uint32_t space_id, struct sanlk_host_event *he, uint64_t from_host_id, uint64_t from_generation) { struct recv_he *rhe; rhe = malloc(sizeof(struct recv_he)); if (!rhe) { log_error("add_host_event no mem"); return; } memset(rhe, 0, sizeof(struct recv_he)); memcpy(&rhe->he, he, sizeof(struct sanlk_host_event)); rhe->space_id = space_id; rhe->from_host_id = from_host_id; rhe->from_generation = from_generation; pthread_mutex_lock(&resource_mutex); list_add_tail(&rhe->list, &host_events); resource_thread_work = 1; pthread_cond_signal(&resource_cond); pthread_mutex_unlock(&resource_mutex); } static struct recv_he *find_host_event(void) { if (list_empty(&host_events)) return NULL; return list_first_entry(&host_events, struct recv_he, list); } static void *resource_thread(void *arg GNUC_UNUSED) { struct task task; struct resource *r; struct token *tt = NULL; struct recv_he *rhe; uint64_t lver; int pid, tt_len; memset(&task, 0, sizeof(struct task)); setup_task_aio(&task, main_task.use_aio, 
RESOURCE_AIO_CB_SIZE); sprintf(task.name, "%s", "resource"); /* a fake/tmp token struct we copy necessary res info into, because other functions take a token struct arg */ tt_len = sizeof(struct token) + (SANLK_MAX_DISKS * sizeof(struct sync_disk)); tt = malloc(tt_len); if (!tt) { log_error("resource_thread tt malloc error"); goto out; } while (1) { pthread_mutex_lock(&resource_mutex); while (!resource_thread_work) { if (resource_thread_stop) { pthread_mutex_unlock(&resource_mutex); goto out; } pthread_cond_wait(&resource_cond, &resource_mutex); } rhe = find_host_event(); if (rhe) { list_del(&rhe->list); pthread_mutex_unlock(&resource_mutex); send_event_callbacks(rhe->space_id, rhe->from_host_id, rhe->from_generation, &rhe->he); free(rhe); continue; } /* FIXME: it's not nice how we copy a bunch of stuff * from token to r so that we can later copy it back from * r into a temp token. The whole duplication of stuff * between token and r would be nice to clean up. */ memset(tt, 0, tt_len); tt->disks = (struct sync_disk *)&tt->r.disks[0]; r = find_resource_thread(&resources_rem, R_THREAD_RELEASE); if (r) { memcpy(&tt->r, &r->r, sizeof(struct sanlk_resource)); copy_disks(&tt->r.disks, &r->r.disks, r->r.num_disks); tt->host_id = r->host_id; tt->host_generation = r->host_generation; tt->res_id = r->res_id; tt->io_timeout = r->io_timeout; tt->sector_size = r->sector_size; tt->align_size = r->align_size; tt->resource = r; /* * Set the time after which we should try to release this * resource again if this current attempt times out. */ if (!r->thread_release_retry) r->thread_release_retry = monotime() + r->io_timeout; else r->thread_release_retry = monotime() + (r->io_timeout * 2); r->flags &= ~R_THREAD_RELEASE; pthread_mutex_unlock(&resource_mutex); resource_thread_release(&task, r, tt); continue; } /* * We don't want to search all of resource_held each time * we are woken unless we know there is something to examine. 
*/ if (!resource_thread_work_examine) goto find_done; r = find_resource_thread(&resources_held, R_THREAD_EXAMINE); if (r) { /* make copies of things we need because we can't use r once we unlock the mutex since it could be released */ memcpy(&tt->r, &r->r, sizeof(struct sanlk_resource)); copy_disks(&tt->r.disks, &r->r.disks, r->r.num_disks); tt->host_id = r->host_id; tt->host_generation = r->host_generation; tt->res_id = r->res_id; tt->io_timeout = r->io_timeout; tt->sector_size = r->sector_size; tt->align_size = r->align_size; pid = r->pid; lver = r->leader.lver; r->flags &= ~R_THREAD_EXAMINE; pthread_mutex_unlock(&resource_mutex); resource_thread_examine(&task, tt, pid, lver); continue; } find_done: resource_thread_work = 0; resource_thread_work_examine = 0; pthread_mutex_unlock(&resource_mutex); } out: if (tt) free(tt); close_task_aio(&task); return NULL; } int release_orphan(struct sanlk_resource *res) { struct resource *r, *safe; int count = 0; pthread_mutex_lock(&resource_mutex); list_for_each_entry_safe(r, safe, &resources_orphan, list) { if (strncmp(r->r.lockspace_name, res->lockspace_name, NAME_ID_SIZE)) continue; if (!res->name[0] || !strncmp(r->r.name, res->name, NAME_ID_SIZE)) { log_debug("release orphan %.48s:%.48s", r->r.lockspace_name, r->r.name); r->flags |= R_THREAD_RELEASE; list_move(&r->list, &resources_rem); count++; } } if (count) { resource_thread_work = 1; pthread_cond_signal(&resource_cond); } pthread_mutex_unlock(&resource_mutex); return count; } static void purge_resource_list(struct list_head *head, char *space_name, const char *list_name) { struct resource *r, *safe; pthread_mutex_lock(&resource_mutex); list_for_each_entry_safe(r, safe, head, list) { if (strncmp(r->r.lockspace_name, space_name, NAME_ID_SIZE)) continue; if (list_name) log_debug("purge %s %.48s:%.48s", list_name, r->r.lockspace_name, r->r.name); list_del(&r->list); free(r); } pthread_mutex_unlock(&resource_mutex); } void purge_resource_orphans(char *space_name) { 
purge_resource_list(&resources_orphan, space_name, "orphan_list"); } void purge_resource_free(char *space_name) { purge_resource_list(&resources_free, space_name, "free_list"); } /* * This is called by the main_loop once a second during normal operation. * The resources_rem list should normally be empty, so this does nothing. * This is needed to wake up the resource_thread to retry release operations * that had timed out previously and need to be retried. */ void rem_resources(void) { pthread_mutex_lock(&resource_mutex); if (!list_empty(&resources_rem) && !resource_thread_work) { resource_thread_work = 1; pthread_cond_signal(&resource_cond); } pthread_mutex_unlock(&resource_mutex); } int setup_token_manager(void) { int rv; pthread_mutex_init(&resource_mutex, NULL); pthread_cond_init(&resource_cond, NULL); INIT_LIST_HEAD(&resources_add); INIT_LIST_HEAD(&resources_rem); INIT_LIST_HEAD(&resources_held); INIT_LIST_HEAD(&resources_free); INIT_LIST_HEAD(&resources_orphan); INIT_LIST_HEAD(&host_events); rv = pthread_create(&resource_pt, NULL, resource_thread, NULL); if (rv) return -1; return 0; } void close_token_manager(void) { pthread_mutex_lock(&resource_mutex); resource_thread_stop = 1; pthread_cond_signal(&resource_cond); pthread_mutex_unlock(&resource_mutex); pthread_join(resource_pt, NULL); } sanlock-3.8.2/src/resource.h000066400000000000000000000045031371427612200157540ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __RESOURCE_H__ #define __RESOURCE_H__ /* * We mostly avoid holding resource_mutex and spaces_mutex at once. When they * are held at once, the order is spaces_mutex, then resource_mutex. 
*/ /* locks resource_mutex */ void send_state_resources(int fd); /* locks resource_mutex */ int lockspace_is_used(struct sanlk_lockspace *ls); /* locks resource_mutex */ int resource_orphan_count(char *space_name); /* no locks */ void check_mode_block(struct token *token, uint64_t next_lver, int q, char *dblock); /* locks resource_mutex */ int convert_token(struct task *task, struct sanlk_resource *res, struct token *cl_token, uint32_t cmd_flags); /* locks resource_mutex */ int acquire_token(struct task *task, struct token *token, uint32_t cmd_flags, char *killpath, char *killargs); /* locks resource_mutex */ int release_token(struct task *task, struct token *token, struct sanlk_resource *resrename); /* locks resource_mutex */ void release_token_async(struct token *token); /* no locks */ int request_token(struct task *task, struct token *token, uint32_t force_mode, uint64_t *owner_id, int next_lver); /* locks resource_mutex */ int set_resource_examine(char *space_name, char *res_name); /* locks resource_mutex */ int res_set_lvb(struct sanlk_resource *res, char *lvb, int lvblen); /* locks resource_mutex */ int res_get_lvb(struct sanlk_resource *res, char **lvb_out, int *lvblen); /* no locks */ int read_resource_owners(struct task *task, struct token *token, struct sanlk_resource *res, char **send_buf, int *send_len, int *count); /* locks resource_mutex */ void rem_resources(void); /* locks resource_mutex */ int release_orphan(struct sanlk_resource *res); /* locks resource_mutex */ void purge_resource_orphans(char *space_name); void purge_resource_free(char *space_name); /* locks resource_mutex */ void add_host_event(uint32_t space_id, struct sanlk_host_event *he, uint64_t from_host_id, uint64_t from_generation); int setup_token_manager(void); void close_token_manager(void); #endif sanlock-3.8.2/src/rindex.c000066400000000000000000000733001371427612200154120ustar00rootroot00000000000000/* * Copyright 2018 Red Hat, Inc. 
* * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "sanlock_admin.h" #include "diskio.h" #include "ondisk.h" #include "log.h" #include "paxos_lease.h" #include "lockspace.h" #include "resource.h" #include "task.h" #include "timeouts.h" #include "rindex_disk.h" #include "rindex.h" #include "paxos_dblock.h" #include "leader.h" struct rindex_info { struct sanlk_rindex *ri; /* point to sanlk_rindex */ struct sync_disk *disk; /* points to sanlk_rindex.disk */ struct rindex_header header; }; /* this token is used for paxos_lease_acquire/release */ static struct token *setup_rindex_token(struct rindex_info *rx, int sector_size, int align_size, struct space_info *spi) { struct token *token; int token_len; token_len = sizeof(struct token) + sizeof(struct sync_disk); token = malloc(token_len); if (!token) return NULL; memset(token, 0, token_len); memcpy(token->r.lockspace_name, rx->ri->lockspace_name, SANLK_NAME_LEN); strcpy(token->r.name, "rindex_lease"); token->sector_size = sector_size; token->align_size = align_size; token->io_timeout = spi ? 
spi->io_timeout : DEFAULT_IO_TIMEOUT; token->r.num_disks = 1; token->r.flags |= sanlk_res_sector_size_to_flag(sector_size); token->r.flags |= sanlk_res_align_size_to_flag(align_size); token->disks = (struct sync_disk *)&token->r.disks[0]; /* shorthand */ memcpy(token->disks[0].path, rx->disk->path, SANLK_PATH_LEN); token->disks[0].offset = rx->disk->offset + align_size; token->disks[0].fd = rx->disk->fd; if (spi) { token->host_id = spi->host_id; token->host_generation = spi->host_generation; token->space_id = spi->space_id; token->res_id = 1; } return token; } /* this token is only used for paxos_lease_init */ static struct token *setup_resource_token(struct rindex_info *rx, char *res_name, int sector_size, int align_size, struct space_info *spi) { struct token *token; int token_len; token_len = sizeof(struct token) + sizeof(struct sync_disk); token = malloc(token_len); if (!token) return NULL; memset(token, 0, token_len); memcpy(token->r.lockspace_name, rx->ri->lockspace_name, SANLK_NAME_LEN); memcpy(token->r.name, res_name, SANLK_NAME_LEN); token->sector_size = sector_size; token->align_size = align_size; token->io_timeout = spi ? 
spi->io_timeout : DEFAULT_IO_TIMEOUT; token->r.num_disks = 1; token->r.flags |= sanlk_res_sector_size_to_flag(sector_size); token->r.flags |= sanlk_res_align_size_to_flag(align_size); token->disks = (struct sync_disk *)&token->r.disks[0]; /* shorthand */ memcpy(token->disks[0].path, rx->disk->path, SANLK_PATH_LEN); token->disks[0].fd = rx->disk->fd; /* there is no offset yet, it is found and set later */ return token; } /* max resource entries supported by each combination of sector/align size */ static uint32_t size_to_max_resources(int sector_size, int align_size) { if ((sector_size == 512) && (align_size == ALIGN_SIZE_1M)) return 16000; if ((sector_size == 4096) && (align_size == ALIGN_SIZE_1M)) return 16000; if ((sector_size == 4096) && (align_size == ALIGN_SIZE_2M)) return 32000; if ((sector_size == 4096) && (align_size == ALIGN_SIZE_4M)) return 64000; if ((sector_size == 4096) && (align_size == ALIGN_SIZE_8M)) return 128000; /* this shouldn't happen */ return 16000; } static int search_entries(struct rindex_info *rx, char *rindex_iobuf, uint64_t *ent_offset, uint64_t *res_offset, int find_free, char *find_name) { struct rindex_entry re; struct rindex_entry *re_end; uint64_t entry_offset_in_rindex; uint32_t max_resources = rx->header.max_resources; int sector_size = rx->header.sector_size; int align_size = rindex_header_align_size_from_flag(rx->header.flags); int i; if (!max_resources) max_resources = size_to_max_resources(sector_size, align_size); for (i = 0; i < max_resources; i++) { /* skip first sector which holds header */ entry_offset_in_rindex = sector_size + (i * sizeof(struct rindex_entry)); re_end = (struct rindex_entry *)(rindex_iobuf + entry_offset_in_rindex); rindex_entry_in(re_end, &re); if (find_free && (!re.res_offset && !re.name[0])) { *ent_offset = entry_offset_in_rindex; *res_offset = rx->disk->offset + (2 * align_size) + (i * align_size); return 0; } if (find_name && re.name[0] && !strncmp(re.name, find_name, SANLK_NAME_LEN)) { *ent_offset 
= entry_offset_in_rindex; *res_offset = rx->disk->offset + (2 * align_size) + (i * align_size); return 0; } } return -ENOENT; } static int update_rindex(struct task *task, struct space_info *spi, struct rindex_info *rx, char *rindex_iobuf, struct sanlk_rentry *re, uint64_t ent_offset, uint64_t res_offset, int delete) { struct rindex_entry re_new; struct rindex_entry re_end; char *sector_iobuf; char **p_iobuf; uint32_t sector_offset; uint32_t entry_offset_in_sector; int sector_size = rx->header.sector_size; int iobuf_len; int rv; /* * ent_offset is the offset (in bytes) from the start of the rindex to * the entry being updated. (This includes the size of the header * sector; no offsets are calculated from the end of the header * sector.) * * sector_offset is the offset (in bytes) from the start of the rindex * to the sector containing ent_offset. The entire sector is written. * * entry_offset_in_sector is the offset (in bytes) from the start of * the target sector to the entry being updated. 
*/ sector_offset = (ent_offset / sector_size) * sector_size; entry_offset_in_sector = ent_offset % sector_size; iobuf_len = sector_size; p_iobuf = §or_iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) return rv; memset(sector_iobuf, 0, iobuf_len); memset(&re_new, 0, sizeof(struct rindex_entry)); if (!delete) { memcpy(re_new.name, re->name, NAME_ID_SIZE); re_new.res_offset = res_offset; } rindex_entry_out(&re_new, &re_end); /* initialize new sector with existing index content */ memcpy(sector_iobuf, rindex_iobuf + sector_offset, sector_size); /* replace the specific entry */ memcpy(sector_iobuf + entry_offset_in_sector, &re_end, sizeof(struct rindex_entry)); rv = write_iobuf(rx->disk->fd, rx->disk->offset + sector_offset, sector_iobuf, iobuf_len, task, spi->io_timeout, NULL); if (rv != SANLK_AIO_TIMEOUT) free(sector_iobuf); return rv; } static int read_rindex(struct task *task, struct space_info *spi, struct rindex_info *rx, char **rindex_iobuf_ret) { char *iobuf; char **p_iobuf; int align_size = rindex_header_align_size_from_flag(rx->header.flags); int iobuf_len; int rv; iobuf_len = align_size; p_iobuf = &iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) { return rv; } memset(iobuf, 0, iobuf_len); rv = read_iobuf(rx->disk->fd, rx->disk->offset, iobuf, iobuf_len, task, spi->io_timeout, NULL); if (rv < 0) { free(iobuf); return rv; } *rindex_iobuf_ret = iobuf; return rv; } static int read_rindex_header(struct task *task, struct space_info *spi, struct rindex_info *rx) { struct rindex_header *rh_end; char *iobuf; char **p_iobuf; int sector_size = spi->sector_size; int io_timeout = spi->io_timeout; int iobuf_len; int rv; if (!sector_size) sector_size = 4096; if (!io_timeout) { io_timeout = DEFAULT_IO_TIMEOUT; spi->io_timeout = io_timeout; } /* * lockspace sector_size will usually be the same as rindex sector_size. 
* use the lockspace sector size for reading the rindex header which * officially gives us the rindex sector_size. */ iobuf_len = sector_size; p_iobuf = &iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) return -ENOMEM; rv = read_iobuf(rx->disk->fd, rx->disk->offset, iobuf, iobuf_len, task, io_timeout, NULL); if (rv < 0) goto out; rh_end = (struct rindex_header *)iobuf; rindex_header_in(rh_end, &rx->header); if (rx->header.magic != RINDEX_DISK_MAGIC) { log_debug("rindex header bad magic %x vs %x on %s:%llu", rx->header.magic, RINDEX_DISK_MAGIC, rx->disk->path, (unsigned long long)rx->disk->offset); rv = SANLK_RINDEX_MAGIC; goto out; } if ((rx->header.version & 0xFFFF0000) != RINDEX_DISK_VERSION_MAJOR) { log_debug("rindex header bad version %x vs %x on %s:%llu", rx->header.version, RINDEX_DISK_VERSION_MAJOR, rx->disk->path, (unsigned long long)rx->disk->offset); rv = SANLK_RINDEX_VERSION; goto out; } if (strcmp(rx->header.lockspace_name, rx->ri->lockspace_name)) { log_debug("rindex header bad lockspace_name %.48s vs %.48s on %s:%llu", rx->header.lockspace_name, rx->ri->lockspace_name, rx->disk->path, (unsigned long long)rx->disk->offset); rv = SANLK_RINDEX_LOCKSPACE; goto out; } if (rx->header.rx_offset != rx->disk->offset) { log_debug("rindex header bad offset %llu on %s:%llu", (unsigned long long)rx->header.rx_offset, rx->disk->path, (unsigned long long)rx->disk->offset); rv = SANLK_RINDEX_OFFSET; goto out; } out: if (rv != SANLK_AIO_TIMEOUT) free(iobuf); return rv; } /* * format rindex: write new rindex header, and initialize internal paxos lease * for protecting the rindex. 
*/ int rindex_format(struct task *task, struct sanlk_rindex *ri) { struct rindex_info rx; struct rindex_header rh; struct rindex_header rh_end; struct token *token; char *iobuf; char **p_iobuf; uint32_t max_resources; uint32_t max_resources_limit; int write_io_timeout; int sector_size = 0; int align_size = 0; int max_hosts = 0; int iobuf_len; int rv; memset(&rx, 0, sizeof(rx)); rx.ri = ri; rx.disk = (struct sync_disk *)&ri->disk; rv = open_disk(rx.disk); if (rv < 0) { log_error("rindex_format open failed %d %s", rv, rx.disk->path); return rv; } rv = sizes_from_flags(ri->flags, §or_size, &align_size, &max_hosts, "RIF"); if (rv) return rv; if (!sector_size) { /* sector/align flags were not set, use historical defaults */ sector_size = rx.disk->sector_size; align_size = sector_size_to_align_size_old(sector_size); max_hosts = DEFAULT_MAX_HOSTS; } /* * When unspecified, default to 4096 to limit the amount of searching. */ max_resources = rx.ri->max_resources; if (!max_resources) max_resources = 4096; max_resources_limit = size_to_max_resources(sector_size, align_size); if (max_resources > max_resources_limit) max_resources = max_resources_limit; log_debug("rindex_format %.48s:%s:%llu %d %d max_res %u", rx.ri->lockspace_name, rx.disk->path, (unsigned long long)rx.disk->offset, sector_size, align_size, max_resources); iobuf_len = align_size; p_iobuf = &iobuf; rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); if (rv) goto out_close; memset(iobuf, 0, iobuf_len); memset(&rh, 0, sizeof(struct rindex_header)); rh.magic = RINDEX_DISK_MAGIC; rh.version = RINDEX_DISK_VERSION_MAJOR | RINDEX_DISK_VERSION_MINOR; rh.flags = rindex_header_align_flag_from_size(align_size); rh.sector_size = sector_size; rh.max_resources = max_resources; rh.rx_offset = rx.disk->offset; strncpy(rh.lockspace_name, rx.ri->lockspace_name, NAME_ID_SIZE); memset(&rh_end, 0, sizeof(struct rindex_header)); rindex_header_out(&rh, &rh_end); memcpy(iobuf, &rh_end, sizeof(struct rindex_header)); if 
(com.write_init_io_timeout) write_io_timeout = com.write_init_io_timeout; else write_io_timeout = DEFAULT_IO_TIMEOUT; rv = write_iobuf(rx.disk->fd, rx.disk->offset, iobuf, iobuf_len, task, write_io_timeout, NULL); if (rv < 0) { log_error("rindex_format write failed %d %s", rv, rx.disk->path); goto out_iobuf; } token = setup_rindex_token(&rx, sector_size, align_size, NULL); if (!token) { rv = -ENOMEM; goto out_iobuf; } rv = paxos_lease_init(task, token, 0, 0); if (rv < 0) { log_error("rindex_format lease init failed %d", rv); goto out_token; } rv = 0; out_token: free(token); out_iobuf: if (rv != SANLK_AIO_TIMEOUT) free(iobuf); out_close: close_disks(rx.disk, 1); return rv; } int rindex_create(struct task *task, struct sanlk_rindex *ri, struct sanlk_rentry *re, struct sanlk_rentry *re_ret, uint32_t max_hosts, uint32_t num_hosts) { struct rindex_info rx; struct space_info spi; struct leader_record leader; struct paxos_dblock dblock; struct token *rx_token; struct token *res_token; char *rindex_iobuf = NULL; uint64_t ent_offset, res_offset; int sector_size, align_size; int rv; memset(&rx, 0, sizeof(rx)); rx.ri = ri; rx.disk = (struct sync_disk *)&ri->disk; rv = open_disk(rx.disk); if (rv < 0) { log_error("rindex_create open failed %d %s", rv, rx.disk->path); return rv; } /* * Allows only one rindex op for a given lockspace at a time. * If there's already one in progress, this returns EBUSY. * Also collects lockspace info at the same time. 
*/ memset(&spi, 0, sizeof(spi)); rv = lockspace_begin_rindex_op(ri->lockspace_name, RX_OP_CREATE, &spi); if (rv < 0) { log_error("rindex_create lockspace not available %d %.48s", rv, ri->lockspace_name); goto out_close; } rv = read_rindex_header(task, &spi, &rx); if (rv < 0) { log_error("rindex_create failed to read rindex header %d on %s:%llu", rv, rx.disk->path, (unsigned long long)rx.disk->offset); goto out_clear; } sector_size = rx.header.sector_size; align_size = rindex_header_align_size_from_flag(rx.header.flags); log_debug("rindex_create %.48s:%s:%llu %d %d max_res %u", rx.ri->lockspace_name, rx.disk->path, (unsigned long long)rx.disk->offset, sector_size, align_size, rx.header.max_resources); /* used to acquire the internal paxos lease protecting the rindex */ rx_token = setup_rindex_token(&rx, sector_size, align_size, &spi); if (!rx_token) { rv = -ENOMEM; goto out_clear; } /* used to initialize the new paxos lease for the resource */ res_token = setup_resource_token(&rx, re->name, sector_size, align_size, &spi); if (!res_token) { free(rx_token); rv = -ENOMEM; goto out_clear; } log_debug("rindex_create acquire offset %llu sector_size %d align_size %d", (unsigned long long)rx_token->disks[0].offset, rx_token->sector_size, rx_token->align_size); rv = paxos_lease_acquire(task, rx_token, PAXOS_ACQUIRE_OWNER_NOWAIT | PAXOS_ACQUIRE_QUIET_FAIL, &leader, &dblock, 0, 0); if (rv < 0) { /* TODO: sleep and retry if this fails because it's held by another host? 
*/ log_error("rindex_create failed to acquire rindex lease %d", rv); goto out_token; } rv = read_rindex(task, &spi, &rx, &rindex_iobuf); if (rv < 0) { log_error("rindex_create failed to read rindex %d", rv); goto out_lease; } rv = search_entries(&rx, rindex_iobuf, &ent_offset, &res_offset, 1, NULL); if (rv < 0) { log_error("rindex_create failed to find free offset %d", rv); goto out_iobuf; } /* set the location of the new paxos lease */ log_debug("rindex_create found offset %llu for %.48s:%.48s", (unsigned long long)res_offset, rx.ri->lockspace_name, re->name); res_token->disks[0].offset = res_offset; /* write the new paxos lease */ rv = paxos_lease_init(task, res_token, num_hosts, 0); if (rv < 0) { log_error("rindex_create failed to init new lease %d", rv); goto out_iobuf; } rv = update_rindex(task, &spi, &rx, rindex_iobuf, re, ent_offset, res_offset, 0); if (rv < 0) { log_error("rindex_create failed to update rindex %d", rv); goto out_iobuf; } log_debug("rindex_create updated rindex entry %llu for %.48s %llu", (unsigned long long)ent_offset, re->name, (unsigned long long)res_offset); re_ret->offset = res_offset; rv = 0; out_iobuf: free(rindex_iobuf); out_lease: paxos_lease_release(task, rx_token, NULL, &leader, &leader); out_token: free(rx_token); free(res_token); out_clear: lockspace_clear_rindex_op(ri->lockspace_name); out_close: close_disks(rx.disk, 1); return rv; } /* * clear the rindex entry for a given resource lease name and offset * first the rentry is cleared, then the resource lease is cleared */ int rindex_delete(struct task *task, struct sanlk_rindex *ri, struct sanlk_rentry *re, struct sanlk_rentry *re_ret) { struct rindex_info rx; struct space_info spi; struct leader_record leader; struct paxos_dblock dblock; struct token *rx_token; struct token *res_token; char *rindex_iobuf = NULL; uint64_t res_offset = re->offset; uint64_t ent_offset; int sector_size, align_size; int rv; memset(&rx, 0, sizeof(rx)); rx.ri = ri; rx.disk = (struct sync_disk 
*)&ri->disk; rv = open_disk(rx.disk); if (rv < 0) { log_error("rindex_create open failed %d %s", rv, rx.disk->path); return rv; } /* * Allows only one rindex op for a given lockspace at a time. * If there's already one in progress, this returns EBUSY. * Also collects lockspace info at the same time. */ memset(&spi, 0, sizeof(spi)); rv = lockspace_begin_rindex_op(ri->lockspace_name, RX_OP_DELETE, &spi); if (rv < 0) { log_error("rindex_delete lockspace not available %d %.48s", rv, ri->lockspace_name); goto out_close; } rv = read_rindex_header(task, &spi, &rx); if (rv < 0) { log_error("rindex_delete failed to read rindex header %d on %s:%llu", rv, rx.disk->path, (unsigned long long)rx.disk->offset); goto out_clear; } sector_size = rx.header.sector_size; align_size = rindex_header_align_size_from_flag(rx.header.flags); /* resource lease locations must use the same alignment as the rindex */ if (re->offset && (re->offset % align_size)) { rv = SANLK_RINDEX_OFFSET; goto out_clear; } /* used to acquire the internal paxos lease protecting the rindex */ rx_token = setup_rindex_token(&rx, sector_size, align_size, &spi); if (!rx_token) { rv = -ENOMEM; goto out_clear; } /* used to write the cleared paxos lease for the resource */ res_token = setup_resource_token(&rx, re->name, sector_size, align_size, &spi); if (!res_token) { free(rx_token); rv = -ENOMEM; goto out_clear; } rv = paxos_lease_acquire(task, rx_token, PAXOS_ACQUIRE_OWNER_NOWAIT | PAXOS_ACQUIRE_QUIET_FAIL, &leader, &dblock, 0, 0); if (rv < 0) { /* TODO: sleep and retry if this fails because it's held by another host? 
*/ log_error("rindex_create failed to acquire rindex lease %d", rv); goto out_token; } rv = read_rindex(task, &spi, &rx, &rindex_iobuf); if (rv < 0) { log_error("rindex_delete failed to read rindex %d", rv); goto out_lease; } /* find the entry */ rv = search_entries(&rx, rindex_iobuf, &ent_offset, &res_offset, 0, re->name); if (rv < 0) { log_error("rindex_delete failed to find entry '%s': %d", re->name, rv); goto out_iobuf; } rv = update_rindex(task, &spi, &rx, rindex_iobuf, re, ent_offset, res_offset, 1); if (rv < 0) { log_error("rindex_delete failed to update rindex %d", rv); goto out_iobuf; } /* clear the paxos lease */ res_token->disks[0].offset = res_offset; rv = paxos_lease_init(task, res_token, 0, 1); if (rv < 0) { log_error("rindex_delete failed to init new lease %d", rv); goto out_iobuf; } log_debug("rindex_delete updated rindex entry %llu for %.48s %llu", (unsigned long long)ent_offset, re->name, (unsigned long long)res_offset); re_ret->offset = 0; rv = 0; out_iobuf: free(rindex_iobuf); out_lease: paxos_lease_release(task, rx_token, NULL, &leader, &leader); out_token: free(rx_token); free(res_token); out_clear: lockspace_clear_rindex_op(ri->lockspace_name); out_close: close_disks(rx.disk, 1); return rv; } int rindex_lookup(struct task *task, struct sanlk_rindex *ri, struct sanlk_rentry *re, struct sanlk_rentry *re_ret, uint32_t cmd_flags) { struct rindex_info rx; struct space_info spi; struct rindex_entry re_in; struct rindex_entry *re_end; char *rindex_iobuf = NULL; uint64_t ent_offset, res_offset; int entry_num; int sector_size, align_size; int nolock = cmd_flags & SANLK_RX_NO_LOCKSPACE; int rv; memset(&rx, 0, sizeof(rx)); rx.ri = ri; rx.disk = (struct sync_disk *)&ri->disk; rv = open_disk(rx.disk); if (rv < 0) { return rv; } memset(&spi, 0, sizeof(spi)); if (!nolock) { rv = lockspace_begin_rindex_op(ri->lockspace_name, RX_OP_LOOKUP, &spi); if (rv < 0) { goto out_close; } } rv = read_rindex_header(task, &spi, &rx); if (rv < 0) { goto out_clear; } 
sector_size = rx.header.sector_size; align_size = rindex_header_align_size_from_flag(rx.header.flags); rv = read_rindex(task, &spi, &rx, &rindex_iobuf); if (rv < 0) { goto out_clear; } if (re->offset && (re->offset % align_size)) { rv = SANLK_RINDEX_OFFSET; goto out_clear; } if (!re->name[0] && !re->offset) { /* find the first free resource lease offset */ rv = search_entries(&rx, rindex_iobuf, &ent_offset, &res_offset, 1, NULL); if (rv < 0) { goto out_iobuf; } memset(re_ret->name, 0, SANLK_NAME_LEN); re_ret->offset = res_offset; rv = 0; } else if (!re->name[0] && re->offset) { /* find the name of the resource lease that the index has recorded for the given resource lease offset */ res_offset = re->offset; entry_num = (res_offset - rx.disk->offset - (2 * align_size)) / align_size; ent_offset = sector_size + (entry_num * sizeof(struct rindex_entry)); re_end = (struct rindex_entry *)(rindex_iobuf + ent_offset); rindex_entry_in(re_end, &re_in); memcpy(re_ret->name, re_in.name, SANLK_NAME_LEN); re_ret->offset = res_offset; rv = 0; } else if (re->name[0] && !re->offset) { /* search the rindex entries for a given resource lease name and if found return the offset of the resource lease */ rv = search_entries(&rx, rindex_iobuf, &ent_offset, &res_offset, 0, re->name); if (rv < 0) { goto out_iobuf; } memcpy(re_ret->name, re->name, SANLK_NAME_LEN); re_ret->offset = res_offset; rv = 0; } else if (re->name[0] && re->offset) { /* find the name of the resource lease that the index has recorded for the given resource lease offset, and if it doesn't match the specified name, then it's an error */ res_offset = re->offset; entry_num = (res_offset - rx.disk->offset - (2 * align_size)) / align_size; ent_offset = sector_size + (entry_num * sizeof(struct rindex_entry)); re_end = (struct rindex_entry *)(rindex_iobuf + ent_offset); rindex_entry_in(re_end, &re_in); if (strncmp(re->name, re_in.name, SANLK_NAME_LEN)) rv = SANLK_RINDEX_DIFF; else rv = 0; memcpy(re_ret->name, re_in.name, 
SANLK_NAME_LEN); re_ret->offset = res_offset; } out_iobuf: free(rindex_iobuf); out_clear: if (!nolock) lockspace_clear_rindex_op(ri->lockspace_name); out_close: close_disks(rx.disk, 1); return rv; } int rindex_update(struct task *task, struct sanlk_rindex *ri, struct sanlk_rentry *re, struct sanlk_rentry *re_ret, uint32_t cmd_flags) { struct rindex_info rx; struct space_info spi; char *rindex_iobuf = NULL; uint64_t ent_offset, res_offset; int entry_num; int sector_size, align_size; int op_remove = 0, op_add = 0; int nolock = cmd_flags & SANLK_RX_NO_LOCKSPACE; int rv; memset(&rx, 0, sizeof(rx)); rx.ri = ri; rx.disk = (struct sync_disk *)&ri->disk; rv = open_disk(rx.disk); if (rv < 0) { return rv; } memset(&spi, 0, sizeof(spi)); if (!nolock) { rv = lockspace_begin_rindex_op(ri->lockspace_name, RX_OP_UPDATE, &spi); if (rv < 0) { goto out_close; } } rv = read_rindex_header(task, &spi, &rx); if (rv < 0) { goto out_clear; } rv = read_rindex(task, &spi, &rx, &rindex_iobuf); if (rv < 0) { goto out_clear; } sector_size = rx.header.sector_size; align_size = rindex_header_align_size_from_flag(rx.header.flags); if (re->offset && (re->offset % align_size)) { rv = SANLK_RINDEX_OFFSET; goto out_clear; } res_offset = re->offset; entry_num = (res_offset - rx.disk->offset - (2 * align_size)) / align_size; ent_offset = sector_size + (entry_num * sizeof(struct rindex_entry)); if ((cmd_flags & SANLK_RXUP_REM) && re->offset) { op_remove = 1; } else if ((cmd_flags & SANLK_RXUP_ADD) && re->name[0] && re->offset) { op_add = 1; } else { rv = -EINVAL; goto out_iobuf; } rv = update_rindex(task, &spi, &rx, rindex_iobuf, re, ent_offset, res_offset, op_remove); if (rv < 0) { log_error("rindex_update failed to update rindex %d", rv); goto out_iobuf; } rv = 0; if (op_remove) { memset(re_ret->name, 0, SANLK_NAME_LEN); re_ret->offset = 0; } if (op_add) { memcpy(re_ret->name, re->name, SANLK_NAME_LEN); re_ret->offset = res_offset; } out_iobuf: free(rindex_iobuf); out_clear: if (!nolock) 
lockspace_clear_rindex_op(ri->lockspace_name); out_close: close_disks(rx.disk, 1); return rv; } int rindex_rebuild(struct task *task, struct sanlk_rindex *ri, uint32_t cmd_flags) { struct rindex_info rx; struct rindex_entry re_new; struct rindex_entry re_end; struct space_info spi; struct leader_record leader; struct paxos_dblock dblock; struct token *rx_token; struct token *res_token; struct sanlk_resource res; char off_str[16]; char *rindex_iobuf = NULL; uint64_t res_offset; uint64_t ent_offset; uint32_t max_resources; int sector_size, align_size; int nolock = cmd_flags & SANLK_RX_NO_LOCKSPACE; int i, rv; memset(&rx, 0, sizeof(rx)); rx.ri = ri; rx.disk = (struct sync_disk *)&ri->disk; rv = open_disk(rx.disk); if (rv < 0) { log_error("rindex_rebuild open failed %d %s", rv, rx.disk->path); return rv; } /* * Allows only one rindex op for a given lockspace at a time. * If there's already one in progress, this returns EBUSY. * Also collects lockspace info at the same time. */ memset(&spi, 0, sizeof(spi)); if (!nolock) { rv = lockspace_begin_rindex_op(ri->lockspace_name, RX_OP_REBUILD, &spi); if (rv < 0) { log_error("rindex_rebuild lockspace not available %d %.48s", rv, ri->lockspace_name); goto out_close; } } rv = read_rindex_header(task, &spi, &rx); if (rv < 0) { log_error("rindex_rebuild failed to read rindex header %d on %s:%llu", rv, rx.disk->path, (unsigned long long)rx.disk->offset); goto out_clear; } sector_size = rx.header.sector_size; align_size = rindex_header_align_size_from_flag(rx.header.flags); max_resources = rx.header.max_resources; if (!max_resources) max_resources = size_to_max_resources(sector_size, align_size); log_debug("rindex_rebuild %.48s:%s:%llu %d %d max_res %u", rx.ri->lockspace_name, rx.disk->path, (unsigned long long)rx.disk->offset, sector_size, align_size, max_resources); /* used to acquire the internal paxos lease protecting the rindex */ rx_token = setup_rindex_token(&rx, sector_size, align_size, &spi); if (!rx_token) { rv = -ENOMEM; 
goto out_clear; } memset(&res, 0, sizeof(res)); res_token = setup_resource_token(&rx, res.name, sector_size, align_size, &spi); if (!res_token) { free(rx_token); rv = -ENOMEM; goto out_clear; } if (!nolock) { rv = paxos_lease_acquire(task, rx_token, PAXOS_ACQUIRE_OWNER_NOWAIT | PAXOS_ACQUIRE_QUIET_FAIL, &leader, &dblock, 0, 0); if (rv < 0) { /* TODO: sleep and retry if this fails because it's held by another host? */ log_error("rindex_rebuild failed to acquire rindex lease %d", rv); goto out_token; } } rv = read_rindex(task, &spi, &rx, &rindex_iobuf); if (rv < 0) { log_error("rindex_rebuild failed to read rindex %d", rv); goto out_lease; } /* * Zero all the entries after the header sector. Entries will be * recreated in the zeroed space if corresponding resource leases are * found. */ memset(rindex_iobuf + sector_size, 0, align_size - sector_size); /* * We read each potential resource lease offset to check if a * lease exists there. It's ok if there is none, and we don't * want to log errors if none is found. */ res_token->flags |= T_CHECK_EXISTS; /* * Read each potential resource lease area and add an rindex entry * for each one that's found. Resource leases begin after * the rindex area and the rindex lease area. 
*/ res_offset = rx.disk->offset + (2 * align_size); for (i = 0; i < max_resources; i++) { memset(&re_new, 0, sizeof(re_new)); memset(&re_end, 0, sizeof(re_end)); memset(&res, 0, sizeof(res)); memset(res_token->r.name, 0, SANLK_NAME_LEN); res_token->disks[0].offset = res_offset; rv = paxos_read_resource(task, res_token, &res); offset_to_str(res_offset, sizeof(off_str), off_str); /* end of device */ if (rv == -EMSGSIZE) { log_debug("rindex_rebuild reached end of device at %d %s", i, off_str); break; } if (rv == SANLK_OK) { log_debug("rindex_rebuild found %.48s at %d %s", res.name, i, off_str); re_new.res_offset = res_offset; memcpy(re_new.name, res.name, SANLK_NAME_LEN); rindex_entry_out(&re_new, &re_end); /* Within rindex, entries begin after the header sector */ ent_offset = sector_size + (i * sizeof(struct rindex_entry)); memcpy(rindex_iobuf + ent_offset, &re_end, sizeof(re_end)); } else if ((i + 1) == max_resources) { log_debug("rindex_rebuild found no resource at last %d %s %d", i, off_str, rv); } res_offset += align_size; } rv = write_iobuf(rx.disk->fd, rx.disk->offset, rindex_iobuf, align_size, task, spi.io_timeout, NULL); if (rv < 0) { if (rv != SANLK_AIO_TIMEOUT) free(rindex_iobuf); log_error("rindex_rebuild write failed %d %s", rv, rx.disk->path); goto out_lease; } rv = 0; free(rindex_iobuf); out_lease: if (!nolock) paxos_lease_release(task, rx_token, NULL, &leader, &leader); out_token: free(rx_token); free(res_token); out_clear: if (!nolock) lockspace_clear_rindex_op(ri->lockspace_name); out_close: close_disks(rx.disk, 1); return rv; } sanlock-3.8.2/src/rindex.h000066400000000000000000000021021371427612200154070ustar00rootroot00000000000000/* * Copyright 2018 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #ifndef __RINDEX_H__ #define __RINDEX_H__ int rindex_format(struct task *task, struct sanlk_rindex *ri); int rindex_rebuild(struct task *task, struct sanlk_rindex *ri, uint32_t cmd_flags); int rindex_lookup(struct task *task, struct sanlk_rindex *ri, struct sanlk_rentry *re, struct sanlk_rentry *re_ret, uint32_t cmd_flags); int rindex_update(struct task *task, struct sanlk_rindex *ri, struct sanlk_rentry *re, struct sanlk_rentry *re_ret, uint32_t cmd_flags); int rindex_create(struct task *task, struct sanlk_rindex *ri, struct sanlk_rentry *re, struct sanlk_rentry *re_ret, uint32_t num_hosts, uint32_t max_hosts); int rindex_delete(struct task *task, struct sanlk_rindex *ri, struct sanlk_rentry *re, struct sanlk_rentry *re_ret); #endif sanlock-3.8.2/src/rindex_disk.h000066400000000000000000000070661371427612200164370ustar00rootroot00000000000000/* * Copyright 2018 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __RINDEX_DISK_H__ #define __RINDEX_DISK_H__ /* * The resource index uses two align-size areas: * * 1. The first area (the rindex itself) holds a header and * entries, with each entry recording a resource lease name * and the offset of that lease (the resource lease disk areas * follow the two align-size disk areas used by the resource index.) * * 2. The second area holds an internal paxos lease that sanlock * uses to protect updates to the rindex in the first area. * * The rindex is one align-size area containing between 256 and * 2048 sectors, depending on the sector_size and align_size. * * sector 0 of the index holds the rindex_header. * After this, sectors 1 to 250/500/1000/2000 hold rindex_entry's. * The remaining sectors in the align-size area are unused. * * 512 byte sectors hold 8 entries per sector, * 4096 byte sectors hold 64 entries per sector.
* * ALIGN1M / SECTOR512 = 2000 sectors used for rindex, 16000 max entries * ALIGN1M / SECTOR4K = 250 sectors used for rindex, 16000 max entries * ALIGN2M / SECTOR4K = 500 sectors used for rindex, 32000 max entries * ALIGN4M / SECTOR4K = 1000 sectors used for rindex, 64000 max entries * ALIGN8M / SECTOR4K = 2000 sectors used for rindex, 128000 max entries * * rindex_header.sector_size = 512 | 4096 * * area_size = 1M | 2M | 4M | 8M * * rindex_header.max_resources defaults to 4096 to limit searching. * The caller can specify max_resources up to the max supported by * the sector_size/align_size combination. * * rindex_header.rindex_offset: * location of rindex_header from start of device, set by caller, * must be multiple of area_size. (rindex_offset will often be * 1*area_size because rindex typically follows the lockspace area * which typically starts at offset 0 on the device.) * * entry_size = 64 bytes * * entry_index = N = 0 to (max_resources - 1) * * rindex_entry N offset = rindex_offset + sector_size + (N * entry_size) * (the sector_size contains the rindex_header) * * rindex_entry N holds information about the resource lease in * the N'th area following the two areas used by the resource index. * * resource_leases_start = rindex_offset + (2 * area_size) * resource leases begin after the two resource index areas. 
* (rindex_offset will often be area_size, so resource_leases_start * will often be 3*area_size) * * resource lease N offset = resource_leases_start + (N * area_size) * * rindex_entry[N].res_offset = resource lease N offset */ #define RINDEX_DISK_MAGIC 0x01042018 #define RINDEX_DISK_VERSION_MAJOR 0x00010000 #define RINDEX_DISK_VERSION_MINOR 0x00000002 /* MINOR 2: addition of align flags */ /* rindex_header flags */ #define RHF_ALIGN_1M 0x00000001 #define RHF_ALIGN_2M 0x00000002 #define RHF_ALIGN_4M 0x00000004 #define RHF_ALIGN_8M 0x00000008 struct rindex_header { uint32_t magic; uint32_t version; uint32_t flags; /* RHF_ */ uint32_t sector_size; uint32_t max_resources; uint32_t unused; uint64_t rx_offset; /* location of rindex_header from start of disk */ char lockspace_name[NAME_ID_SIZE]; }; #define MAX_RINDEX_ENTRIES_1M 16000 #define MAX_RINDEX_ENTRIES_8M 128000 /* The entry size is fixed */ struct rindex_entry { uint64_t res_offset; /* location of resource from start of disk */ uint32_t flags; uint32_t unused; char name[NAME_ID_SIZE]; }; #endif sanlock-3.8.2/src/sanlock.8000066400000000000000000001335251371427612200155060ustar00rootroot00000000000000.TH SANLOCK 8 2015-01-23 .SH NAME sanlock \- shared storage lock manager .SH SYNOPSIS .B sanlock [COMMAND] [ACTION] ... .SH DESCRIPTION sanlock is a lock manager built on shared storage. Hosts with access to the storage can perform locking. An application running on the hosts is given a small amount of space on the shared block device or file, and uses sanlock for its own application-specific synchronization. Internally, the sanlock daemon manages locks using two disk-based lease algorithms: delta leases and paxos leases. .IP \[bu] 2 .I delta leases are slow to acquire and demand regular i/o to shared storage. sanlock only uses them internally to hold a lease on its "host_id" (an integer host identifier from 1-2000). They prevent two hosts from using the same host identifier.
The delta lease renewals also indicate if a host is alive. ("Light-Weight Leases for Storage-Centric Coordination", Chockler and Malkhi.) .IP \[bu] .I paxos leases are fast to acquire and sanlock makes them available to applications as general purpose resource leases. The disk paxos algorithm uses host_id's internally to represent different hosts, and the owner of a paxos lease. delta leases provide unique host_id's for implementing paxos leases, and delta lease renewals serve as a proxy for paxos lease renewal. ("Disk Paxos", Eli Gafni and Leslie Lamport.) .P Externally, the sanlock daemon exposes a locking interface through libsanlock in terms of "lockspaces" and "resources". A lockspace is a locking context that an application creates for itself on shared storage. When the application on each host is started, it "joins" the lockspace. It can then create "resources" on the shared storage. Each resource represents an application-specific entity. The application can acquire and release leases on resources. To use sanlock from an application: .IP \[bu] 2 Allocate shared storage for an application, e.g. a shared LUN or LV from a SAN, or files from NFS. .IP \[bu] Provide the storage to the application. .IP \[bu] The application uses this storage with libsanlock to create a lockspace and resources for itself. .IP \[bu] The application joins the lockspace when it starts. .IP \[bu] The application acquires and releases leases on resources. .P How lockspaces and resources translate to delta leases and paxos leases within sanlock: .I Lockspaces .IP \[bu] 2 A lockspace is based on delta leases held by each host using the lockspace. .IP \[bu] A lockspace is a series of 2000 delta leases on disk, and requires 1MB of storage. (See Storage below for size variations.) .IP \[bu] A lockspace can support up to 2000 concurrent hosts using it, each using a different delta lease. 
.IP \[bu] Applications can i) create, ii) join and iii) leave a lockspace, which corresponds to i) initializing the set of delta leases on disk, ii) acquiring one of the delta leases and iii) releasing the delta lease. .IP \[bu] When a lockspace is created, a unique lockspace name and disk location is provided by the application. .IP \[bu] When a lockspace is created/initialized, sanlock formats the sequence of 2000 on-disk delta lease structures on the file or disk, e.g. /mnt/leasefile (NFS) or /dev/vg/lv (SAN). .IP \[bu] The 2000 individual delta leases in a lockspace are identified by number: 1,2,3,...,2000. .IP \[bu] Each delta lease is a 512 byte sector in the 1MB lockspace, offset by its number, e.g. delta lease 1 is offset 0, delta lease 2 is offset 512, delta lease 2000 is offset 1023488. (See Storage below for size variations.) .IP \[bu] When an application joins a lockspace, it must specify the lockspace name, the lockspace location on shared disk/file, and the local host's host_id. sanlock then acquires the delta lease corresponding to the host_id, e.g. joining the lockspace with host_id 1 acquires delta lease 1. .IP \[bu] The terms delta lease, lockspace lease, and host_id lease are used interchangeably. .IP \[bu] sanlock acquires a delta lease by writing the host's unique name to the delta lease disk sector, reading it back after a delay, and verifying it is the same. .IP \[bu] If a unique host name is not specified, sanlock generates a uuid to use as the host's name. The delta lease algorithm depends on hosts using unique names. .IP \[bu] The application on each host should be configured with a unique host_id, where the host_id is an integer 1-2000. .IP \[bu] If hosts are misconfigured and have the same host_id, the delta lease algorithm is designed to detect this conflict, and only one host will be able to acquire the delta lease for that host_id.
.IP \[bu] A delta lease ensures that a lockspace host_id is being used by a single host with the unique name specified in the delta lease. .IP \[bu] Resolving delta lease conflicts is slow, because the algorithm is based on waiting and watching for some time for other hosts to write to the same delta lease sector. If multiple hosts try to use the same delta lease, the delay is increased substantially. So, it is best to configure applications to use unique host_id's that will not conflict. .IP \[bu] After sanlock acquires a delta lease, the lease must be renewed until the application leaves the lockspace (which corresponds to releasing the delta lease on the host_id.) .IP \[bu] sanlock renews delta leases every 20 seconds (by default) by writing a new timestamp into the delta lease sector. .IP \[bu] When a host acquires a delta lease in a lockspace, it can be referred to as "joining" the lockspace. Once it has joined the lockspace, it can use resources associated with the lockspace. .P .I Resources .IP \[bu] 2 A lockspace is a context for resources that can be locked and unlocked by an application. .IP \[bu] sanlock uses paxos leases to implement leases on resources. The terms paxos lease and resource lease are used interchangeably. .IP \[bu] A paxos lease exists on shared storage and requires 1MB of space. It contains a unique resource name and the name of the lockspace. .IP \[bu] An application assigns its own meaning to a sanlock resource and the leases on it. A sanlock resource could represent some shared object like a file, or some unique role among the hosts. .IP \[bu] Resource leases are associated with a specific lockspace and can only be used by hosts that have joined that lockspace (they are holding a delta lease on a host_id in that lockspace.) .IP \[bu] An application must keep track of the disk locations of its lockspaces and resources.
sanlock does not maintain any persistent index or directory of lockspaces or resources that have been created by applications, so applications need to remember where they have placed their own leases (which files or disks and offsets). .IP \[bu] sanlock does not renew paxos leases directly (although it could). Instead, the renewal of a host's delta lease represents the renewal of all that host's paxos leases in the associated lockspace. In effect, many paxos lease renewals are factored out into one delta lease renewal. This reduces i/o when many paxos leases are used. .IP \[bu] The disk paxos algorithm allows multiple hosts to all attempt to acquire the same paxos lease at once, and will produce a single winner/owner of the resource lease. (Shared resource leases are also possible in addition to the default exclusive leases.) .IP \[bu] The disk paxos algorithm involves a specific sequence of reading and writing the sectors of the paxos lease disk area. Each host has a dedicated 512 byte sector in the paxos lease disk area where it writes its own "ballot", and each host reads the entire disk area to see the ballots of other hosts. The first sector of the disk area is the "leader record" that holds the result of the last paxos ballot. The winner of the paxos ballot writes the result of the ballot to the leader record (the winner of the ballot may have selected another contending host as the owner of the paxos lease.) .IP \[bu] After a paxos lease is acquired, no further i/o is done in the paxos lease disk area. .IP \[bu] Releasing the paxos lease involves writing a single sector to clear the current owner in the leader record. .IP \[bu] If a host holding a paxos lease fails, the disk area of the paxos lease still indicates that the paxos lease is owned by the failed host. If another host attempts to acquire the paxos lease, and finds the lease is held by another host_id, it will check the delta lease of that host_id. 
If the delta lease of the host_id is being renewed, then the paxos lease is owned and cannot be acquired. If the delta lease of the owner's host_id has expired, then the paxos lease is expired and can be taken (by going through the paxos lease algorithm.) .IP \[bu] The "interaction" or "awareness" between hosts of each other is limited to the case where they attempt to acquire the same paxos lease, and need to check if the referenced delta lease has expired or not. .IP \[bu] When hosts do not attempt to lock the same resources concurrently, there is no host interaction or awareness. The state or actions of one host have no effect on others. .IP \[bu] To speed up checking delta lease expiration (in the case of a paxos lease conflict), sanlock keeps track of past renewals of other delta leases in the lockspace. .P .I Resource Index The resource index (rindex) is an optional sanlock feature that applications can use to keep track of resource lease offsets. Without the rindex, an application must keep track of where its resource leases exist on disk and find available locations when creating new leases. The sanlock rindex uses two align-size areas on disk following the lockspace. The first area holds rindex entries; each entry records a resource lease name and location. The second area holds a private paxos lease, used by sanlock internally to protect rindex updates. The application creates the rindex on disk with the "format" function. Format is a disk-only operation and does not interact with the live lockspace, so it can be called without first calling add_lockspace. The application needs to follow the convention of writing the lockspace at the start of the device (offset 0) and formatting the rindex immediately following the lockspace area. When formatting, the application must set flags for sector size and align size to match those for the lockspace. To use the rindex, the application: .IP \[bu] 2 Uses the "create" function to create a new resource lease on disk. 
This takes the place of the write_resource function. The create function requires the location of the rindex and the name of the new resource lease. sanlock finds a free lease area, writes the new resource lease at that location, updates the rindex with the name:offset, and returns the offset to the caller. The caller uses this offset when acquiring the resource lease. .IP \[bu] Uses the "delete" function to remove a resource lease on disk (also corresponding to the write_resource function.) sanlock clears the resource lease and the rindex entry for it. A subsequent call to create may use this same disk location for a different resource lease. .IP \[bu] Uses the "lookup" function to discover the offset of a resource lease given the resource lease name. The caller would typically call this prior to acquiring the resource lease. .IP \[bu] Uses the "rebuild" function to recreate the rindex if it is damaged or becomes inconsistent. This function scans the disk for resource leases and creates new rindex entries to match the leases it finds. .IP \[bu] The "update" function manipulates rindex entries directly and should not normally be used by the application. In normal usage, the create and delete functions manipulate rindex entries. Update is mainly useful for testing or repairs. .P .I Expiration .IP \[bu] 2 If a host fails to renew its delta lease, e.g. it loses access to the storage, its delta lease will eventually expire and another host will be able to take over any resource leases held by the host. sanlock must ensure that the application on two different hosts is not holding and using the same lease concurrently. .IP \[bu] When sanlock has failed to renew a delta lease for a period of time, it will begin taking measures to stop local processes (applications) from using any resource leases associated with the expiring lockspace delta lease. sanlock enters this "recovery mode" well ahead of the time when another host could take over the locally owned leases.
sanlock must have sufficient time to stop all local processes that are using the expiring leases. .IP \[bu] sanlock uses three methods to stop local processes that are using expiring leases: 1. Graceful shutdown. sanlock will execute a "graceful shutdown" program that the application previously specified for this case. The shutdown program tells the application to shut down because its leases are expiring. The application must respond by stopping its activities and releasing its leases (or exit). If an application does not specify a graceful shutdown program, sanlock sends SIGTERM to the process instead. The process must release its leases or exit in a prescribed amount of time (see -g), or sanlock proceeds to the next method of stopping. 2. Forced shutdown. sanlock will send SIGKILL to processes using the expiring leases. The processes have a fixed amount of time to exit after receiving SIGKILL. If any do not exit in this time, sanlock will proceed to the next method. 3. Host reset. sanlock will trigger the host's watchdog device to forcibly reset it. sanlock carefully manages the timing of the watchdog device so that it fires shortly before any other host could take over the resource leases held by local processes. .P .I Failures If a process holding resource leases fails or exits without releasing its leases, sanlock will release the leases for it automatically (unless persistent resource leases were used.) If the sanlock daemon cannot renew a lockspace delta lease for a specific period of time (see Expiration), sanlock will enter "recovery mode" where it attempts to stop and/or kill any processes holding resource leases in the expiring lockspace. If the processes do not exit in time, sanlock will force the host to be reset using the local watchdog device. If the sanlock daemon crashes or hangs, it will not renew the expiry time of the per-lockspace connections it had to the wdmd daemon. 
This will lead to the expiration of the local watchdog device, and the host will be reset. .I Watchdog sanlock uses the wdmd(8) daemon to access /dev/watchdog. wdmd multiplexes multiple timeouts onto the single watchdog timer. This is required because delta leases for each lockspace are renewed and expire independently. sanlock maintains a wdmd connection for each lockspace delta lease being renewed. Each connection has an expiry time for some seconds in the future. After each successful delta lease renewal, the expiry time is renewed for the associated wdmd connection. If wdmd finds any connection expired, it will not renew the /dev/watchdog timer. Given enough successive failed renewals, the watchdog device will fire and reset the host. (Given the multiplexing nature of wdmd, shorter overlapping renewal failures from multiple lockspaces could cause spurious watchdog firing.) The direct link between delta lease renewals and watchdog renewals provides a predictable watchdog firing time based on delta lease renewal timestamps that are visible from other hosts. sanlock knows the time the watchdog on another host has fired based on the delta lease time. Furthermore, if the watchdog device on another host fails to fire when it should, the continuation of delta lease renewals from the other host will make this evident and prevent leases from being taken from the failed host. If sanlock is able to stop/kill all processes using an expiring lockspace, the associated wdmd connection for that lockspace is removed. The expired wdmd connection will no longer block /dev/watchdog renewals, and the host should avoid being reset. .I Storage The sector size and the align size should be specified when creating lockspaces and resources (and rindex). The "align size" is the size on disk of a lockspace or a resource, i.e. the amount of disk space it uses. Lockspaces and resources should use matching sector and align sizes, and must use offsets in multiples of the align size.
The max number of hosts that can use a lockspace or resource depends on the combination of sector size and align size, shown below. The host_id of hosts using the lockspace can be no larger than the max_hosts value for the lockspace. Accepted combinations of sector size and align size, and the corresponding max_hosts (and max host_id) are: sector_size 512, align_size 1M, max_hosts 2000 .br sector_size 4096, align_size 1M, max_hosts 250 .br sector_size 4096, align_size 2M, max_hosts 500 .br sector_size 4096, align_size 4M, max_hosts 1000 .br sector_size 4096, align_size 8M, max_hosts 2000 When sector_size and align_size are not specified, the behavior matches the behavior before these sizes could be configured: on devices which report sector size 512, 512/1M/2000 is used, on devices which report sector size 4096, 4096/8M/2000 is used, and on files, 512/1M/2000 is always used. (Other combinations are not compatible with sanlock version 3.6 or earlier.) Using sanlock on shared block devices that do host based mirroring or replication is not likely to work correctly. When using sanlock on shared files, all sanlock io should go to one file server. .I Example This is an example of creating and using lockspaces and resources from the command line. (Most applications would use sanlock through libsanlock rather than through the command line.) .IP 1. 4 Allocate shared storage for sanlock leases. This example assumes 512 byte sectors on the device, in which case the lockspace needs 1MB and each resource needs 1MB. The example shared block device accessible to all hosts is /dev/leases. .IP 2. 4 Start sanlock on all hosts. The -w 0 disables use of the watchdog for testing. .nf # sanlock daemon -w 0 .fi .IP 3. 4 Start a dummy application on all hosts. This sanlock command registers with sanlock, then execs the sleep command which inherits the registered fd. The sleep process acts as the dummy application. 
Because the sleep process is registered with sanlock, leases can be acquired for it. .nf # sanlock client command -c /bin/sleep 600 & .fi .IP 4. 4 Create a lockspace for the application (from one host). The lockspace is named "test". .nf # sanlock client init -s test:0:/dev/leases:0 .fi .IP 5. 4 Join the lockspace for the application. Use a unique host_id on each host. .nf host1: # sanlock client add_lockspace -s test:1:/dev/leases:0 host2: # sanlock client add_lockspace -s test:2:/dev/leases:0 .fi .IP 6. 4 Create two resources for the application (from one host). The resources are named "RA" and "RB". Offsets are used on the same device as the lockspace. Different LVs or files could also be used. .nf # sanlock client init -r test:RA:/dev/leases:1048576 # sanlock client init -r test:RB:/dev/leases:2097152 .fi .IP 7. 4 Acquire resource leases for the application on host1. Acquire an exclusive lease (the default) on the first resource, and a shared lease (SH) on the second resource. .nf # export P=`pidof sleep` # sanlock client acquire -r test:RA:/dev/leases:1048576 -p $P # sanlock client acquire -r test:RB:/dev/leases:2097152:SH -p $P .fi .IP 8. 4 Acquire resource leases for the application on host2. Acquiring the exclusive lease on the first resource will fail because it is held by host1. Acquiring the shared lease on the second resource will succeed. .nf # export P=`pidof sleep` # sanlock client acquire -r test:RA:/dev/leases:1048576 -p $P # sanlock client acquire -r test:RB:/dev/leases:2097152:SH -p $P .fi .IP 9. 4 Release resource leases for the application on both hosts. The sleep pid could also be killed, which will result in the sanlock daemon releasing its leases when it exits. .nf # sanlock client release -r test:RA:/dev/leases:1048576 -p $P # sanlock client release -r test:RB:/dev/leases:2097152 -p $P .fi .IP 10. 4 Leave the lockspace for the application. 
.nf host1: # sanlock client rem_lockspace -s test:1:/dev/leases:0 host2: # sanlock client rem_lockspace -s test:2:/dev/leases:0 .fi .IP 11. 4 Stop sanlock on all hosts. .nf # sanlock shutdown .fi .SH OPTIONS .P COMMAND can be one of three primary top level choices .P .BR "sanlock daemon" " start daemon" .br .BR "sanlock client" " send request to daemon (default command if none given)" .br .BR "sanlock direct" " access storage directly (no coordination with daemon)" .SS Daemon Command .BR "sanlock daemon" " [options]" .BR -D " " no fork and print all logging to stderr .BR -Q " 0|1" quiet error messages for common lock contention .BR -R " 0|1" renewal debugging, log debug info for each renewal .BI -L " pri" write logging at priority level and up to logfile (-1 none) .BI -S " pri" write logging at priority level and up to syslog (-1 none) .BI -U " uid" user id .BI -G " gid" group id .BI -H " num" renewal history size .BI -t " num" max worker threads .BI -g " sec" seconds for graceful recovery .BR -w " 0|1" use watchdog through wdmd .BR -h " 0|1" use high priority (RR) scheduling .BI -l " num" use mlockall (0 none, 1 current, 2 current and future) .BI -b " sec" seconds a host id bit will remain set in delta lease bitmap .BI -e " str" local host name used in delta leases ./" non-aio is untested and may not work ./" .BR \-a " 0|1" ./" use async i/o .SS Client Command .B "sanlock client" .I action [options] .B sanlock client status Print processes, lockspaces, and resources being managed by the sanlock daemon. Add -D to show extra internal daemon status for debugging. Add -o p to show resources by pid, or -o s to show resources by lockspace. .B sanlock client host_status Print state of host_id delta leases read during the last renewal. State of all lockspaces is shown (use -s to select one). Add -D to show extra internal daemon status for debugging. .B sanlock client gets Print lockspaces being managed by the sanlock daemon. 
The LOCKSPACE string will be followed by ADD or REM if the lockspace is currently being added or removed. Add -h 1 to also show hosts in each lockspace. .BR "sanlock client renewal -s" " LOCKSPACE" Print a history of renewals with timing details. See the Renewal history section below. .B sanlock client log_dump Print the sanlock daemon internal debug log. .B sanlock client shutdown Ask the sanlock daemon to exit. Without the force option (-f 0), the command will be ignored if any lockspaces exist. With the force option (-f 1), any registered processes will be killed, their resource leases released, and lockspaces removed. With the wait option (-w 1), the command will wait for a result from the daemon indicating that it has shut down and is exiting, or cannot shut down because lockspaces exist (command fails). .BR "sanlock client init -s" " LOCKSPACE" Tell the sanlock daemon to initialize a lockspace on disk. The -o option can be used to specify the io timeout to be written in the host_id leases. The -Z and -A options can be used to specify the sector size and align size, and both should be set together. (Also see sanlock direct init.) .BR "sanlock client init -r" " RESOURCE" Tell the sanlock daemon to initialize a resource lease on disk. The -Z and -A options can be used to specify the sector size and align size, and both should be set together. (Also see sanlock direct init.) .BR "sanlock client read -s" " LOCKSPACE" Tell the sanlock daemon to read a lockspace from disk. Only the LOCKSPACE path and offset are required. If host_id is zero, the first record at offset (host_id 1) is used. The complete LOCKSPACE is printed. Add -D to print other details. (Also see sanlock direct read_leader.) .BR "sanlock client read -r" " RESOURCE" Tell the sanlock daemon to read a resource lease from disk. Only the RESOURCE path and offset are required. The complete RESOURCE is printed. Add -D to print other details. (Also see sanlock direct read_leader.) 
.BR "sanlock client add_lockspace -s" " LOCKSPACE" Tell the sanlock daemon to acquire the specified host_id in the lockspace. This will allow resources to be acquired in the lockspace. The -o option can be used to specify the io timeout of the acquiring host, and will be written in the host_id lease. .BR "sanlock client inq_lockspace -s" " LOCKSPACE" Inquire about the state of the lockspace in the sanlock daemon, whether it is being added or removed, or is joined. .BR "sanlock client rem_lockspace -s" " LOCKSPACE" Tell the sanlock daemon to release the specified host_id in the lockspace. Any processes holding resource leases in this lockspace will be killed, and the resource leases not released. .BR "sanlock client command -r" " RESOURCE " \ \fB-c\fP " " \fIpath\fP " " \fIargs\fP Register with the sanlock daemon, acquire the specified resource lease, and exec the command at path with args. When the command exits, the sanlock daemon will release the lease. -c must be the final option. .BR "sanlock client acquire -r" " RESOURCE " \ \fB-p\fP " " \fIpid\fP .br .BR "sanlock client release -r" " RESOURCE " \ \fB-p\fP " " \fIpid\fP Tell the sanlock daemon to acquire or release the specified resource lease for the given pid. The pid must be registered with the sanlock daemon. acquire can optionally take a versioned RESOURCE string RESOURCE:lver, where lver is the version of the lease that must be acquired, or fail. .BR "sanlock client convert -r" " RESOURCE " \ \fB-p\fP " " \fIpid\fP Tell the sanlock daemon to convert the mode of the specified resource lease for the given pid. If the existing mode is exclusive (default), the mode of the lease can be converted to shared with RESOURCE:SH. If the existing mode is shared, the mode of the lease can be converted to exclusive with RESOURCE (no :SH suffix). .BI "sanlock client inquire -p" " pid" Print the resource leases held by the given pid.
The format is a versioned RESOURCE string "RESOURCE:lver" where lver is the version of the lease held. .BR "sanlock client request -r" " RESOURCE " \ \fB-f\fP " " \fIforce_mode\fP Request the owner of a resource do something specified by force_mode. A versioned RESOURCE:lver string must be used with a greater version than is presently held. Zero lver and force_mode clears the request. .BR "sanlock client examine -r" " RESOURCE" Examine the request record for the currently held resource lease and carry out the action specified by the requested force_mode. .BR "sanlock client examine -s" " LOCKSPACE" Examine requests for all resource leases currently held in the named lockspace. Only lockspace_name is used from the LOCKSPACE argument. .BR "sanlock client set_event -s" " LOCKSPACE " \ \fB-i\fP " " \fIhost_id\fP " " \ \fB-g\fP " " \fIgen\fP " " \ \fB-e\fP " " \fInum\fP " " \ \fB-d\fP " " \fInum\fP Set an event for another host. When the sanlock daemon next renews its delta lease for the lockspace it will: set the bit for the host_id in its bitmap, and set the generation, event and data values in its own delta lease. An application that has registered for events from this lockspace on the destination host will get the event that has been set when the destination sees the event during its next delta lease renewal. .BR "sanlock client set_config -s" " LOCKSPACE" Set a configuration value for a lockspace. Only lockspace_name is used from the LOCKSPACE argument. The USED flag has the same effect on a lockspace as a process holding a resource lease that will not exit. The USED_BY_ORPHANS flag means that an orphan resource lease will have the same effect as the USED flag. .br \-u 0|1 Set (1) or clear (0) the USED flag. .br \-O 0|1 Set (1) or clear (0) the USED_BY_ORPHANS flag. \fBsanlock client format -x\fP RINDEX Create a resource index on disk. Use -Z and -A to set the sector size and align size to match the lockspace.
\fBsanlock client create -x\fP RINDEX \fB-e\fP \fIresource_name\fP Create a new resource lease on disk, using the rindex to find a free offset. \fBsanlock client delete -x\fP RINDEX \fB-e\fP \fIresource_name\fP[:\fIoffset\fP] Delete an existing resource lease on disk. \fBsanlock client lookup -x\fP RINDEX \fB-e\fP \fIresource_name\fP Look up the offset of an existing resource lease by name on disk, using the rindex. With no -e option, lookup returns the next free lease offset. If -e specifies both name and offset, the lookup verifies both are correct. \fBsanlock client update -x\fP RINDEX \fB-e\fP \fIresource_name\fP[:\fIoffset\fP] [\fB-z 0|1\fP] Add (-z 0) or remove (-z 1) an rindex entry on disk. \fBsanlock client rebuild -x\fP RINDEX Rebuild the rindex entries by scanning the disk for resource leases. .SS Direct Command .B "sanlock direct" .I action [options] ./" non-aio is untested and may not work ./" .BR \-a " 0|1" ./" use async i/o .BI -o " sec" io timeout in seconds .BR "sanlock direct init -s" " LOCKSPACE" .br .BR "sanlock direct init -r" " RESOURCE" Initialize storage for a lockspace or resource. Use the -Z and -A flags to specify the sector size and align size. The max hosts that can use the lockspace/resource (and the max possible host_id) is determined by the sector/align size combination. Possible combinations are: 512/1M, 4096/1M, 4096/2M, 4096/4M, 4096/8M. Lockspaces and resources both use the same amount of space (align_size) for each combination. When initializing a lockspace, sanlock initializes delta leases for max_hosts in the given space. When initializing a resource, sanlock initializes a single paxos lease in the space. With -s, the -o option specifies the io timeout to be written in the host_id leases. With -r, the -z 1 option invalidates the resource lease on disk so it cannot be used until reinitialized normally.
.BR "sanlock direct read_leader -s" " LOCKSPACE" .br .BR "sanlock direct read_leader -r" " RESOURCE" Read a leader record from disk and print the fields. The leader record is the single sector of a delta lease, or the first sector of a paxos lease. ./" .P ./" .BR "sanlock direct acquire_id -s" " LOCKSPACE" ./" .br ./" .BR "sanlock direct renew_id -s" " LOCKSPACE" ./" .br ./" .BR "sanlock direct release_id -s" " LOCKSPACE" ./" ./" Acquire, renew, or release a host_id directly to disk, independent from ./" the sanlock daemon. Not for general use. This should only be used for ./" testing or for manual recovery in an emergency. ./" ./" .P ./" .BR "sanlock direct acquire -r" " RESOURCE " \ ./" \fB-i\fP " " \fInum\fP " " \fB-g\fP " " \fInum\fP ./" .br ./" .BR "sanlock direct release -r" " RESOURCE " \ ./" \fB-i\fP " " \fInum\fP " " \fB-g\fP " " \fInum\fP ./" ./" Not supported. Not for general use. ./" .BI "sanlock direct dump" " path" \ \fR[\fP\fB:\fP\fIoffset\fP\fR[\fP\fB:\fP\fIsize\fP\fR]]\fP Read disk sectors and print leader records for delta or paxos leases. Add -f 1 to print the request record values for paxos leases, host_ids set in delta lease bitmaps, and rindex entries. \fBsanlock direct format -x\fP RINDEX .br \fBsanlock direct lookup -x\fP RINDEX \fB-e\fP \fIresource_name\fP .br \fBsanlock direct update -x\fP RINDEX \fB-e\fP \fIresource_name\fP[:\fIoffset\fP] [\fB-z 0|1\fP] .br \fBsanlock direct rebuild -x\fP RINDEX Access the resource index on disk without going through the sanlock daemon. This precludes using the internal paxos lease to protect rindex modifications. See client equivalents for descriptions. 
.SS LOCKSPACE option string .BR \-s " " \fIlockspace_name\fP:\fIhost_id\fP:\fIpath\fP:\fIoffset\fP .P .IR lockspace_name " name of lockspace" .br .IR host_id " local host identifier in lockspace" .br .IR path " path to storage to use for leases" .br .IR offset " offset on path (bytes)" .br .SS RESOURCE option string .BR \-r " " \fIlockspace_name\fP:\fIresource_name\fP:\fIpath\fP:\fIoffset\fP .P .IR lockspace_name " name of lockspace" .br .IR resource_name " name of resource" .br .IR path " path to storage to use for leases" .br .IR offset " offset on path (bytes)" .SS RESOURCE option string with suffix .BR \-r " " \fIlockspace_name\fP:\fIresource_name\fP:\fIpath\fP:\fIoffset\fP:\fIlver\fP .P .IR lver " leader version" .BR \-r " " \fIlockspace_name\fP:\fIresource_name\fP:\fIpath\fP:\fIoffset\fP:SH .P SH indicates shared mode .SS RINDEX option string \fB\-x\fP \fIlockspace_name\fP:\fIpath\fP:\fIoffset\fP .P .IR lockspace_name " name of lockspace" .br .IR path " path to storage to use for leases" .br .IR offset " offset on path (bytes) of rindex" .SS Defaults .B sanlock help shows the default values for the options above. .B sanlock version shows the build version. .SH OTHER .SS Request/Examine The first part of making a request for a resource is writing the request record of the resource (the sector following the leader record). To make a successful request: .IP \(bu 2 RESOURCE:lver must be greater than the lver presently held by the other host. This implies the leader record must be read to discover the lver, prior to making a request. .IP \(bu 2 RESOURCE:lver must be greater than or equal to the lver presently written to the request record. Two hosts may write a new request at the same time for the same lver, in which case both would succeed, but the force_mode from the last would win. .IP \(bu 2 The force_mode must be greater than zero.
.IP \(bu 2 To unconditionally clear the request record (set both lver and force_mode to 0), make request with RESOURCE:0 and force_mode 0. .P The owner of the requested resource will not know of the request unless it is explicitly told to examine its resources via the "examine" api/command, or otherwise notified. The second part of making a request is notifying the resource lease owner that it should examine the request records of its resource leases. The notification will cause the lease owner to automatically run the equivalent of "sanlock client examine -s LOCKSPACE" for the lockspace of the requested resource. The notification is made using a bitmap in each host_id delta lease. Each bit represents each of the possible host_ids (1-2000). If host A wants to notify host B to examine its resources, A sets the bit in its own bitmap that corresponds to the host_id of B. When B next renews its delta lease, it reads the delta leases for all hosts and checks each bitmap to see if its own host_id has been set. It finds the bit for its own host_id set in A's bitmap, and examines its resource request records. (The bit remains set in A's bitmap for set_bitmap_seconds.) .I force_mode determines the action the resource lease owner should take: .IP \[bu] 2 FORCE (1): kill the process holding the resource lease. When the process has exited, the resource lease will be released, and can then be acquired by anyone. The kill signal is SIGKILL (or SIGTERM if SIGKILL is restricted.) .IP \[bu] 2 GRACEFUL (2): run the program configured by sanlock_killpath against the process holding the resource lease. If no killpath is defined, then FORCE is used. .P .SS Persistent and orphan resource leases A resource lease can be acquired with the PERSISTENT flag (-P 1). If the process holding the lease exits, the lease will not be released, but kept on an orphan list.
Another local process can acquire an orphan lease using the ORPHAN flag (-O 1), or release the orphan lease using the ORPHAN flag (-O 1). All orphan leases can be released by setting the lockspace name (-s lockspace_name) with no resource name. .P .SS Renewal history sanlock saves a limited history of lease renewal information in each lockspace. See sanlock.conf renewal_history_size to set the amount of history or to disable (set to 0). IO times are measured in delta lease renewal (each delta lease renewal includes one read and one write). For each successful renewal, a record is saved that includes: .IP \[bu] 2 the timestamp written in the delta lease by the renewal .IP \[bu] 2 the time in milliseconds taken by the delta lease read .IP \[bu] 2 the time in milliseconds taken by the delta lease write .P Also counted and recorded are the number of io timeouts and other io errors that occur between successful renewals. Two consecutive successful renewals would be recorded as: .br .nf timestamp=5332 read_ms=482 write_ms=5525 next_timeouts=0 next_errors=0 timestamp=5353 read_ms=99 write_ms=3161 next_timeouts=0 next_errors=0 .fi Those fields are: .IP \[bu] 2 timestamp is the value written into the delta lease during that renewal. .IP \[bu] 2 read_ms/write_ms are the milliseconds taken for the renewal read/write ios. .IP \[bu] 2 next_timeouts are the number of io timeouts that occurred after the renewal recorded on that line, and before the next successful renewal on the following line. .IP \[bu] 2 next_errors are the number of io errors (not timeouts) that occurred after the renewal recorded on that line, and before the next successful renewal on the following line. .P The command 'sanlock client renewal -s lockspace_name' reports the full history of renewals saved by sanlock, which by default is 180 records, about 1 hour of history when using a 20 second renewal interval for a 10 second io timeout. .SH INTERNALS .SS Disk Format .IP \[bu] 2 This example uses 512 byte sectors.
.IP \[bu] 2 Each lockspace is 1MB. It holds 2000 delta_leases, one per sector, supporting up to 2000 hosts. .IP \[bu] 2 Each paxos_lease is 1MB. It is used as a lease for one resource. .IP \[bu] 2 The leader_record structure is used differently by each lease type. .IP \[bu] 2 To display all leader_record fields, see sanlock direct read_leader. .IP \[bu] 2 A lockspace is often followed on disk by the paxos_leases used within that lockspace, but this layout is not required. .IP \[bu] 2 The request_record and host_id bitmap are used for requests/events. .IP \[bu] 2 The mode_block contains the SHARED flag indicating a lease is held in the shared mode. .IP \[bu] 2 In a lockspace, the host using host_id N writes to a single delta_lease in sector N-1. No other hosts write to this sector. All hosts read all lockspace sectors when renewing their own delta_lease, and are able to monitor renewals of all delta_leases. .IP \[bu] 2 In a paxos_lease, each host has a dedicated sector it writes to, containing its own paxos_dblock and mode_block structures. Its sector is based on its host_id; host_id 1 writes to the dblock/mode_block in sector 2 of the paxos_lease. .IP \[bu] 2 The paxos_dblock structures are used by the paxos_lease algorithm, and the result is written to the leader_record. .P .B 0x000000 lockspace foo:0:/path:0 (There is no representation on disk of the lockspace in general, only the sequence of specific delta_leases which collectively represent the lockspace.) .B delta_lease foo:1:/path:0 .nf 0x000 0 leader_record (sector 0, for host_id 1) magic: 0x12212010 space_name: foo resource_name: host uuid/name \.\.\. host_id bitmap (leader_record + 256) .fi .B delta_lease foo:2:/path:0 .nf 0x200 512 leader_record (sector 1, for host_id 2) magic: 0x12212010 space_name: foo resource_name: host uuid/name \.\.\. 
host_id bitmap (leader_record + 256) .fi .B delta_lease foo:3:/path:0 .nf 0x400 1024 leader_record (sector 2, for host_id 3) magic: 0x12212010 space_name: foo resource_name: host uuid/name \.\.\. host_id bitmap (leader_record + 256) .fi .B delta_lease foo:2000:/path:0 .nf 0xF9E00 leader_record (sector 1999, for host_id 2000) magic: 0x12212010 space_name: foo resource_name: host uuid/name \.\.\. host_id bitmap (leader_record + 256) .fi .B 0x100000 paxos_lease foo:example1:/path:1048576 .nf 0x000 0 leader_record (sector 0) magic: 0x06152010 space_name: foo resource_name: example1 0x200 512 request_record (sector 1) magic: 0x08292011 0x400 1024 paxos_dblock (sector 2, for host_id 1) 0x480 1152 mode_block (paxos_dblock + 128) 0x600 1536 paxos_dblock (sector 3, for host_id 2) 0x680 1664 mode_block (paxos_dblock + 128) 0x800 2048 paxos_dblock (sector 4, for host_id 3) 0x880 2176 mode_block (paxos_dblock + 128) 0xFA200 paxos_dblock (sector 2001, for host_id 2000) 0xFA280 mode_block (paxos_dblock + 128) .fi .B 0x200000 paxos_lease foo:example2:/path:2097152 .nf 0x000 0 leader_record (sector 0) magic: 0x06152010 space_name: foo resource_name: example2 0x200 512 request_record (sector 1) magic: 0x08292011 0x400 1024 paxos_dblock (sector 2, for host_id 1) 0x480 1152 mode_block (paxos_dblock + 128) 0x600 1536 paxos_dblock (sector 3, for host_id 2) 0x680 1664 mode_block (paxos_dblock + 128) 0x800 2048 paxos_dblock (sector 4, for host_id 3) 0x880 2176 mode_block (paxos_dblock + 128) 0xFA200 paxos_dblock (sector 2001, for host_id 2000) 0xFA280 mode_block (paxos_dblock + 128) .fi .SS Lease ownership Not shown in the leader_record structures above are the owner_id, owner_generation and timestamp fields. These are the fields that define the lease owner. The delta_lease at sector N for host_id N+1 has leader_record.owner_id N+1. The leader_record.owner_generation is incremented each time the delta_lease is acquired. 
When a delta_lease is acquired, the leader_record.timestamp field is set to the time of the host and the leader_record.resource_name is set to the unique name of the host. When the host renews the delta_lease, it writes a new leader_record.timestamp. When a host releases a delta_lease, it writes zero to leader_record.timestamp. When a host acquires a paxos_lease, it uses the host_id/generation value from the delta_lease it holds in the lockspace. It uses this host_id/generation to identify itself in the paxos_dblock when running the paxos algorithm. The result of the algorithm is the winning host_id/generation - the new owner of the paxos_lease. The winning host_id/generation are written to the paxos_lease leader_record.owner_id and leader_record.owner_generation fields and leader_record.timestamp is set. When a host releases a paxos_lease, it sets leader_record.timestamp to 0. When a paxos_lease is free (leader_record.timestamp is 0), multiple hosts may attempt to acquire it. The paxos algorithm, using the paxos_dblock structures, will select only one of the hosts as the new owner, and that owner is written in the leader_record. The paxos_lease will no longer be free (non-zero timestamp). Other hosts will see this and will not attempt to acquire the paxos_lease until it is free again. If a paxos_lease is owned (non-zero timestamp), but the owner has not renewed its delta_lease for a specific length of time, then the owner value in the paxos_lease becomes expired, and other hosts will use the paxos algorithm to acquire the paxos_lease, and set a new owner. .SH FILES /etc/sanlock/sanlock.conf .IP \[bu] 2 quiet_fail = 1 .br See -Q .IP \[bu] 2 debug_renew = 0 .br See -R .IP \[bu] 2 logfile_priority = 4 .br See -L .IP \[bu] 2 logfile_use_utc = 0 .br Use UTC instead of local time in log messages. .IP \[bu] 2 syslog_priority = 3 .br See -S .IP \[bu] 2 names_log_priority = 4 .br Log resource names at this priority level (uses syslog priority numbers). 
If this is greater than or equal to logfile_priority, each requested resource name and location is recorded in sanlock.log. .IP \[bu] 2 use_watchdog = 1 .br See -w .IP \[bu] 2 high_priority = 1 .br See -h .IP \[bu] 2 mlock_level = 1 .br See -l .IP \[bu] 2 sh_retries = 8 .br The number of times to try acquiring a paxos lease when acquiring a shared lease when the paxos lease is held by another host acquiring a shared lease. .IP \[bu] 2 uname = sanlock .br See -U .IP \[bu] 2 gname = sanlock .br See -G .IP \[bu] 2 our_host_name = .br See -e .IP \[bu] 2 renewal_read_extend_sec = .br If a renewal read i/o times out, wait this many additional seconds for that read to complete at the start of the subsequent renewal attempt. When not configured, sanlock waits for an additional io_timeout seconds for a previous timed out read to complete. .IP \[bu] 2 renewal_history_size = 180 .br See -H .IP \[bu] 2 paxos_debug_all = 0 .br Include all details in the paxos debug logging. .IP \[bu] 2 debug_io = .br Add debug logging for each i/o. "submit" (no quotes) produces debug output at submission time, "complete" produces debug output at completion time, and "submit,complete" (no space) produces both. .IP \[bu] 2 max_sectors_kb = | .br Set to "ignore" (no quotes) to prevent sanlock from checking or changing max_sectors_kb for the lockspace disk when starting a lockspace. Set to "align" (no quotes) to set max_sectors_kb for the lockspace disk to the align size of the lockspace. Set to a number to set a specific number of KB for all lockspace disks. .IP \[bu] 2 debug_clients = 0 .br Enable or disable debug logging for all client connections to the sanlock daemon. .IP \[bu] 2 debug_cmd = +|- .br Enable (+name) or disable (-name) debug logging at the command processing level for specifically named commands, e.g. "debug_cmd = +acquire", or "debug_cmd = -inq_lockspace". Repeat this line for each command name. Use a plus prefix before the name to enable and a minus prefix to disable. 
By default sanlock disables some command level debugging for commands that are often repetitive and fill the in memory debug buffer. This only affects debug logging, not errors or warnings, and disabling command level debugging for a command does not disable lower level debugging for that command. Special values +all and -all can be used to enable or disable all commands, and can be used before or after other debug_cmd lines. .IP \[bu] 2 write_init_io_timeout = .br The io timeout to use when initializing ondisk lease structures for a lockspace or resource. This timeout is not used as a part of either lease algorithm (as the standard io_timeout is.) .SH SEE ALSO .BR wdmd (8) sanlock-3.8.2/src/sanlock.conf000066400000000000000000000021171371427612200162540ustar00rootroot00000000000000# sanlock config file # # Values set here can be overriden on the command line. # See 'man sanlock' and 'sanlock -h' for more information. # # Example settings: # # quiet_fail = 1 # command line: -Q 0|1 # # debug_renew = 0 # command line: -R 0|1 # # logfile_priority = 4 # command line: -L 4 # # logfile_use_utc = 0 # command line: n/a # # syslog_priority = 3 # command line: -S 3 # # names_log_priority = 4 # command line: n/a # # use_watchdog = 1 # command line: -w 1 # # high_priority = 1 # command line: -h 1 # # mlock_level = 1 # command line: -l 1 # # sh_retries = 8 # command line: n/a # # uname = sanlock # command line: -U # # gname = sanlock # command line: -G # # our_host_name = # command line: -e # # renewal_read_extend_sec = # command line: n/a # # paxos_debug_all = 0 # command line: n/a # # debug_io = # command line: n/a # # max_sectors_kb = # command line: n/a # # debug_clients = 0 # command line: n/a # # debug_cmd = + # debug_cmd = - # ... # command line: n/a # # write_init_io_timeout = # command line: n/a sanlock-3.8.2/src/sanlock.h000066400000000000000000000127371371427612200155670ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. 
* * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. */ #ifndef __SANLOCK_H__ #define __SANLOCK_H__ /* an acquire or release call can specify this many explicit resources in a single call. */ #define SANLK_MAX_RESOURCES 8 /* max resource name length */ #define SANLK_NAME_LEN 48 /* max disk path length, includes terminating \0 byte, and escape chars, i.e. the strlen with esc chars inserted must still be less than 1024. */ #define SANLK_PATH_LEN 1024 /* max length of kill script path and args, includes terminating \0 byte */ #define SANLK_HELPER_PATH_LEN 128 #define SANLK_HELPER_ARGS_LEN 128 /* max disks in a single lease */ #define SANLK_MAX_DISKS 4 /* * max length of a sanlk_resource in string format * :::[::...]: * 48 SANLK_NAME_LEN * + 1 colon * + 48 SANLK_NAME_LEN * + 1 colon * + 4184 (4 MAX_DISKS * (1024 SANLK_PATH_LEN + 1 colon + 20 offset + 1 colon)) * + 20 lver * ------ * 4302 */ #define SANLK_MAX_RES_STR 4400 /* TODO: add more padding to sanlk_disk so we can extend sync_disk later without changing abi */ struct sanlk_disk { char path[SANLK_PATH_LEN]; /* must include terminating \0 */ uint64_t offset; uint32_t pad1; uint32_t pad2; }; /* * PERSISTENT: if the pid holding the resource lease exits, * the lease will not be released, but will be moved to the * orphans list. On disk and from the perspective of other * hosts, nothing changes when a lease is orphaned; it continues * to be held by the host. * * (If persistent shared locks are used on a resource, then * all the locks on that resource should be persistent.) * * A new process can acquire an orphan resource using * the ACQUIRE_ORPHAN flag.
This implies that the lockspace * had continued running and the resource not released by the * host between the time the resource became an orphan and was * then transferred to a new process. * * Orphan impact on the lockspace: if the lockspace is stopping * because of rem, or lease failure, the ls config option * USED_BY_ORPHANS will block the release of the lockspace * (like the USED option), if orphans exist for the lockspace. * Without USED_BY_ORPHANS, the lockspace would exit and * leave the orphan resources unchanged (not released) on disk. * The unreleased orphan resources could be acquired by another * host if the lockspace lease is cleanly released. */ #define SANLK_RES_LVER 0x00000001 /* lver field is set */ #define SANLK_RES_NUM_HOSTS 0x00000002 /* data32 field is new num_hosts */ #define SANLK_RES_SHARED 0x00000004 #define SANLK_RES_PERSISTENT 0x00000008 #define SANLK_RES_ALIGN1M 0x00000010 #define SANLK_RES_ALIGN2M 0x00000020 #define SANLK_RES_ALIGN4M 0x00000040 #define SANLK_RES_ALIGN8M 0x00000080 #define SANLK_RES_SECTOR512 0x00000100 #define SANLK_RES_SECTOR4K 0x00000200 struct sanlk_resource { char lockspace_name[SANLK_NAME_LEN]; /* terminating \0 not required */ char name[SANLK_NAME_LEN]; /* terminating \0 not required */ uint64_t lver; /* use with SANLK_RES_LVER */ uint64_t data64; /* per-resource command-specific data */ uint32_t data32; /* per-resource command-specific data */ uint32_t unused; uint32_t flags; /* SANLK_RES_ */ uint32_t num_disks; /* followed by num_disks sanlk_disk structs */ struct sanlk_disk disks[0]; }; /* make these values match the RES equivalent in case of typos */ #define SANLK_RIF_ALIGN1M 0x00000010 #define SANLK_RIF_ALIGN2M 0x00000020 #define SANLK_RIF_ALIGN4M 0x00000040 #define SANLK_RIF_ALIGN8M 0x00000080 #define SANLK_RIF_SECTOR512 0x00000100 #define SANLK_RIF_SECTOR4K 0x00000200 struct sanlk_rindex { uint32_t flags; /* SANLK_RIF_ */ uint32_t max_resources; /* the max res structs that will follow rindex */ uint64_t 
unused; char lockspace_name[SANLK_NAME_LEN]; /* terminating \0 not required */ struct sanlk_disk disk; /* location of rindex */ }; struct sanlk_rentry { char name[SANLK_NAME_LEN]; /* terminating \0 not required */ uint64_t offset; uint32_t flags; uint32_t unused; }; /* command-specific command options (can include per resource data, but that requires the extra work of segmenting it by resource name) */ struct sanlk_options { char owner_name[SANLK_NAME_LEN]; /* optional user friendly name */ uint32_t flags; uint32_t len; /* followed by len bytes (migration input will use this) */ char str[0]; }; #define SANLK_LSF_ADD 0x00000001 #define SANLK_LSF_REM 0x00000002 /* make these values match the RES equivalent in case of typos */ #define SANLK_LSF_ALIGN1M 0x00000010 #define SANLK_LSF_ALIGN2M 0x00000020 #define SANLK_LSF_ALIGN4M 0x00000040 #define SANLK_LSF_ALIGN8M 0x00000080 #define SANLK_LSF_SECTOR512 0x00000100 #define SANLK_LSF_SECTOR4K 0x00000200 struct sanlk_lockspace { char name[SANLK_NAME_LEN]; uint64_t host_id; uint32_t flags; /* SANLK_LSF_ */ struct sanlk_disk host_id_disk; }; struct sanlk_host { uint64_t host_id; uint64_t generation; uint64_t timestamp; uint32_t io_timeout; uint32_t flags; }; struct sanlk_host_event { uint64_t host_id; uint64_t generation; uint64_t event; uint64_t data; }; size_t sanlock_path_export(char *dst, const char *src, size_t dstlen); size_t sanlock_path_import(char *dst, const char *src, size_t dstlen); const char *sanlock_strerror(int rv); #endif sanlock-3.8.2/src/sanlock_admin.h000066400000000000000000000502351371427612200167320ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
*/ #ifndef __SANLOCK_ADMIN_H__ #define __SANLOCK_ADMIN_H__ /* add flags */ #define SANLK_ADD_ASYNC 0x00000001 /* rem flags */ #define SANLK_REM_ASYNC 0x00000001 #define SANLK_REM_UNUSED 0x00000002 /* inq flags */ #define SANLK_INQ_WAIT 0x00000001 /* write flags */ #define SANLK_WRITE_CLEAR 0x00000001 /* subsequent read will return error */ /* host status returned in low byte of sanlk_host.flags by get */ #define SANLK_HOST_UNKNOWN 0x00000001 #define SANLK_HOST_FREE 0x00000002 #define SANLK_HOST_LIVE 0x00000003 #define SANLK_HOST_FAIL 0x00000004 #define SANLK_HOST_DEAD 0x00000005 #define SANLK_HOST_MASK 0x0000000F /* select SANLK_HOST_ from flags */ /* * add_lockspace returns: * 0: the lockspace has been added successfully * -EEXIST: the lockspace already exists * -EINPROGRESS: the lockspace is already in the process of being added * (the in-progress add may or may not succeed) * -EAGAIN: the lockspace is being removed * * The _timeout version takes the io_timeout in seconds. * If 0, the global setting for the daemon will be used. */ int sanlock_add_lockspace(struct sanlk_lockspace *ls, uint32_t flags); int sanlock_add_lockspace_timeout(struct sanlk_lockspace *ls, uint32_t flags, uint32_t io_timeout); /* * inq_lockspace returns: * 0: the lockspace exists and is currently held * -ENOENT: lockspace not found */ int sanlock_inq_lockspace(struct sanlk_lockspace *ls, uint32_t flags); /* * rem_lockspace returns: * 0: the lockspace has been removed successfully * -EINPROGRESS: the lockspace is already in the process of being removed * -ENOENT: lockspace not found * -EBUSY: UNUSED was set and lockspace is being used * * The sanlock daemon will kill any pids using the lockspace when the * lockspace is removed (unless UNUSED is set). 
*/ int sanlock_rem_lockspace(struct sanlk_lockspace *ls, uint32_t flags); /* * get_lockspace returns: * 0: all lockspaces copied out, lss_count set to number * -ENOSPC: sanlock internal buffer ran out of space * (lss_count set to number that would have been copied) * * sanlk_lockspace.flags set to SANLK_LSF_ */ int sanlock_get_lockspaces(struct sanlk_lockspace **lss, int *lss_count, uint32_t flags); /* * When host_id is > 0, returns the sanlk_host info about the * specified host_id. * * When host_id is 0, returns sanlk_host info about all hosts * that have been seen alive. * * host status returned by sanlk_host.flags & SANLK_HOST_MASK: * * UNKNOWN: after adding lockspace, there has not yet been * enough time monitoring other hosts to make an accurate * assessment. * * FREE: delta lease not held * the delta lease timestamp is zero * * LIVE: the host is alive * now - last < other_host_fail_seconds * * FAIL: the host is failing and may be in recovery (killing pids) * now - last > other_host_fail_seconds * * DEAD: the host is dead, its watchdog has fired * now - last > other_host_dead_seconds * * now: local monotonic time * * last: if we have never seen the host's timestamp change, then * last is the local monotime when we first checked it, otherwise * last is the local monotime when we last saw the timestamp change * (which would be some time after it was written by the host.) * * other_host_fail_seconds: based on the host's io_timeout, * the number of seconds after which it would begin recovery * (killing pids) if still alive and unable to renew its lease. * * other_host_dead_seconds: based on the host's io_timeout, * the number of seconds after which its watchdog has fired. */ int sanlock_get_hosts(const char *ls_name, uint64_t host_id, struct sanlk_host **hss, int *hss_count, uint32_t flags); /* * set_config cmd values * * USED: the lockspace is being used by an external entity. 
* This has the same effect on lockspace recovery as resources * that are not released by a pid that cannot be killed. * If the lockspace fails (the delta lease cannot be renewed), * the lockspace will not exit while this flag is set, which * will lead to the watchdog firing. * * UNUSED: clears the USED flag in the lockspace. * * USED_BY_ORPHANS: the lockspace should be considered USED * if there are orphan resources for the lockspace. * * UNUSED_BY_ORPHANS: clear the USED_BY_ORPHANS flag in the lockspace. */ #define SANLK_CONFIG_USED 0x00000001 #define SANLK_CONFIG_UNUSED 0x00000002 #define SANLK_CONFIG_USED_BY_ORPHANS 0x00000004 #define SANLK_CONFIG_UNUSED_BY_ORPHANS 0x00000008 int sanlock_set_config(const char *ls_name, uint32_t flags, uint32_t cmd, void *data); /* * Returns the alignment in bytes required by sanlock_init() * (1MB for disks with 512 byte sectors, 8MB for disks with 4096 byte sectors) */ int sanlock_align(struct sanlk_disk *disk); /* * Ask sanlock daemon to initialize disk space. * Use max_hosts = 0 for default value. * Use num_hosts = 0 for default value. * Provide either lockspace or resource, not both * * (Old api, see write_lockspace/resource) */ int sanlock_init(struct sanlk_lockspace *ls, struct sanlk_resource *res, int max_hosts, int num_hosts); /* * Alignment and sector size * * The ALIGN and SECTOR flags can be set in sanlk_lockspace | sanlk_resource * and passed to sanlock_write_lockspace() | sanlock_write_resource(). * These flags cause sanlock to create a lockspace|resource area with the * given align_size, using the given sector_size. The maximum hosts * that can use a lockspace|resource is determined by the combined effect * of ALIGN and SECTOR flags. The following combinations are allowed: * * ALIGN1M | SECTOR512: max_hosts 2000 * ALIGN1M | SECTOR4K: max_hosts 250 * ALIGN2M | SECTOR4K: max_hosts 500 * ALIGN4M | SECTOR4K: max_hosts 1000 * ALIGN8M | SECTOR4K: max_hosts 2000 * * ALIGN and SECTOR flags must both be set, or neither can be set.
When * neither are set, sanlock will: * - detect the sector_size of the disk and use 1M align_size if 512, * and 8M align_size for 4K. * - use 512 sector_size and 8M align_size for files. * * sanlock_read_lockspace() | sanlock_read_resource() will return the * ALIGN and SECTOR flags reflecting the state of the lockspace|resource. * These flags are returned whether or not they were specified in * sanlock_write_lockspace() | sanlock_write_resource(). * (The ALIGN flag can be passed to sanlock_read_lockspace() to avoid * an extra read to discover the sector size.) * * Prior to the addition of ALIGN and SECTOR flags, sanlock will return * neither flag from read. The align_size of the lockspace | resource * can then be determined with sanlock_align(). After the addition of * these flags, sanlock_align() no longer correctly indicates the alignment * of the lockspace | resource. * * With the addition of ALIGN and SECTOR flags, sanlock_align() still * reports the *default* alignment that sanlock will use for disks or * files if ALIGN|SECTOR is not passed to write. * * (A lockspace and its associated resources will typically use the * same align and sector size, but it's conceivable they would not, e.g. * if they were placed on different storage with different sector sizes.) */ /* * write a lockspace to disk * * Set SANLK_LSF_ALIGN and SANLK_LSF_SECTOR in ls.flags to define * the sector size and align size of the lockspace on disk. max_hosts * is derived from these values (the max_hosts arg is not used.) * It is best for resources in the lockspace to use these same * sector/align sizes. * * the sanlock daemon writes max_hosts lockspace leader records to disk * * the lockspace will support up to max_hosts using the lockspace at once * * valid host_id's for this lockspace are 1 to max_hosts. * * the first host_id (1) (the first record at offset) is the last * leader record written, so read_lockspace of host_id 1 will fail * until the entire write_lockspace is complete.
*/ int sanlock_write_lockspace(struct sanlk_lockspace *ls, int max_hosts, uint32_t flags, uint32_t io_timeout); /* * read one host's lockspace record from disk * * the sanlock daemon reads one lockspace leader record from disk * * the minimum input is path and offset * * if name is specified and does not match the leader record name, * SANLK_LEADER_LOCKSPACE is returned * * if name is not specified, it is filled in with the value from disk * * if host_id is zero, host_id 1 is used (the first record at offset) * * if there is no delta lease magic number found at the host_id location, * SANLK_LEADER_MAGIC is returned * * on success, zero is returned and * io_timeout and the entire sanlk_lockspace struct are written to */ int sanlock_read_lockspace(struct sanlk_lockspace *ls, uint32_t flags, uint32_t *io_timeout); /* * format a resource lease area on disk * * the sanlock daemon writes a resource lease area to disk * * Set flag SANLK_WRITE_CLEAR to cause a subsequent read_resource * to return an error. * * Set SANLK_RES_ALIGN and SANLK_RES_SECTOR in res.flags to define * the sector size and align size of the resource on disk. max_hosts * is derived from these values (the max_hosts arg is not used.) * It is best for the ALIGN and SECTOR flags to match those used * for the resource's lockspace.
*/ int sanlock_write_resource(struct sanlk_resource *res, int max_hosts, int num_hosts, uint32_t flags); /* * read a resource lease from disk * * the sanlock daemon reads the lease's leader record from disk * * the minimum input is one disk with path and offset * * if lockspace name is specified and does not match the leader record * lockspace name, SANLK_LEADER_LOCKSPACE is returned * * if resource name is specified and does not match the leader record * resource name, SANLK_LEADER_RESOURCE is returned * * if there is no paxos lease magic number found in the leader record, * SANLK_LEADER_MAGIC is returned * * on success, zero is returned and * the entire sanlk_resource struct is written to (res->disks is not changed) */ int sanlock_read_resource(struct sanlk_resource *res, uint32_t flags); /* * read resource lease and its owners from disk * * the sanlock daemon reads the entire lease area from disk, * including the leader record and all per-host dblock/mode_block records * * res.lver is set (from leader record) * res.flags is set to SANLK_RES_SHARED if any shared owners exist (from mode blocks) * host.host_id and host.generation are set for each owner (from leader or mode blocks) * host.timestamp is set for an exclusive owner (from leader record) */ int sanlock_read_resource_owners(struct sanlk_resource *res, uint32_t flags, struct sanlk_host **hss, int *hss_count); /* * Check the condition of a resource based on the state of the * resource's owners. This can be used to check if a resource * is held by hosts that would cause an acquire to fail. * * owners is the list of hosts returned by sanlock_read_resource_owners() * hosts is the list of hosts returned by sanlock_get_hosts() * * (This is a client side operation as does not go to the daemon.) 
* * For each owner, check its state in hosts: * * - if not found in hosts, then the owner is not running and * would not prevent acquire * * - if found in hosts but the generation does not match, * then the owner host has been restarted since owning the * resource and would not prevent acquire * * - if found in hosts with matching generation, then check * host.flags & MASK: * * - FREE: would not prevent acquire * - DEAD: would not prevent acquire * - LIVE: prevents acquire, test fails * - FAIL: prevents acquire, test fails * - UNKNOWN: might prevent acquire, test fails * * * test_flags returned: * SANLK_TRF_FAIL: state of owners would prevent acquire, test fails */ #define SANLK_TRF_FAIL 0x00000001 int sanlock_test_resource_owners(struct sanlk_resource *res, uint32_t flags, struct sanlk_host *owners, int owners_count, struct sanlk_host *hosts, int hosts_count, uint32_t *test_flags); /* * A resource index stores the disk locations (offsets) of resource leases. * Using it is optional; an application can keep track of lease offsets * without using the index. * * On disk, a resource index uses two alignment-sized regions. * The first holds the records mapping resource names to offsets. * The second holds a paxos lease that sanlock uses internally * to protect updates to the index. The caller chooses the disk * location of the resource index (path and offset), and passes * this as a parameter to all functions that use the index with * struct sanlk_rindex. * * The resource index is followed on disk by the resource leases * that it references. So, using the index removes the ability of * the application to place resource leases at any disk location. * A caller would usually place the resource index after a lockspace * struct on disk (not required.) * * The resource index and the following resource leases must all use * the same align size/flag. * * The rindex specifies the lockspace name that the referenced resource * leases are associated with.
This lockspace will also be used for * the internal rindex paxos lease. * sanlock must be a member of the lockspace to use the create/delete * resource functions. * * format * ------ * Initializes resource index at the specified offset and * initializes an internal paxos lease in the following area. * Set the ALIGN flag in sanlk_rindex corresponding to the desired * sector size; the align size used for the rindex must match the * align size used for resources. * * lookup * ------ * Looks up a value in the resource index. When a res name is set * and *offset is 0, this searches for an entry with the matching * name and if found sets the res lease offset. When res name is not * set and an *offset is not 0, this checks for an entry with the given * res lease offset and if found sets the res name. When name and * offset are both unset, the first free entry is returned in offset. * All resource lease offsets are relative to the start of the device. * sanlock does not acquire the internal rindex paxos lease. * (The offsets are the disk locations of the resource leases, not * the disk locations of the rindex entries for the resource leases.) * * update * ------ * Add or remove an rindex entry. When adding, the rentry * name and offset must both be set, and the index entry is * set to indicate the named resource lease exists at the * specified offset. When removing, the rentry offset needs * to be set, and the index entry for that offset is cleared. * This is not generally used; the create/delete interfaces are * the standard method for updating the index. * * create_resource * --------------- * Searches the index for a free resource lease area, initializes a new * resource lease at that offset, and updates the index for * the new lease. Returns the offset of the new resource lease. * sanlock holds the internal rindex paxos lease around the index * lookup, resource init and index update.
The new lease is initialized * before the index is updated, so the index will not reference * an uninitialized area if the host fails during create_resource. * * delete_resource * --------------- * Updates the index to remove the entry for the named resource lease, * and clears the resource lease at that offset. * sanlock holds the internal rindex paxos lease around the * index update and lease reinitialization. If sanlock fails * after the index update but before clearing the resource, a * subsequent create will overwrite the uncleared resource. * * rebuild * ------- * Rebuilds the rindex based on resource leases that are found. * Reads each potential resource lease area to check if a * resource lease exists at that offset. If so, an rindex * entry is added with that resource name and offset. */ /* * generic rindex flags use lower 16 bits * specific rindex functions use upper 16 bits */ #define SANLK_RX_NO_LOCKSPACE 0x000000001 /* don't use the lockspace */ /* update_rindex flags */ #define SANLK_RXUP_ADD 0x00010000 #define SANLK_RXUP_REM 0x00020000 int sanlock_format_rindex(struct sanlk_rindex *rx, uint32_t flags); int sanlock_update_rindex(struct sanlk_rindex *rx, uint32_t flags, struct sanlk_rentry *re); int sanlock_lookup_rindex(struct sanlk_rindex *rx, uint32_t flags, struct sanlk_rentry *re); int sanlock_rebuild_rindex(struct sanlk_rindex *rx, uint32_t flags); int sanlock_create_resource(struct sanlk_rindex *rx, uint32_t flags, struct sanlk_rentry *re, int max_hosts, int num_hosts); int sanlock_delete_resource(struct sanlk_rindex *rx, uint32_t flags, struct sanlk_rentry *re); int sanlock_version(uint32_t flags, uint32_t *version, uint32_t *proto); /* * Lockspace host events * * reg: register with the sanlock daemon, returns a fd to use in poll(2). * end: unregister and close our fd in the sanlock daemon. * set: set/write an event for another host, in the next ls lease renewal. * get: get/read an event from another host from the registered fd.
* * reg_event * . he arg is unused, can be NULL * . returns -ENOCSI if no more event fds for the ls are available * (MAX_EVENT_FDS 32) * * set_event * . CUR_GENERATION with zero generation in he means that sanlock * will fill in the he generation with the current generation. * . CLEAR_HOSTID will cause sanlock to clear the host_id in its * bitmap in the next renewal, even if the default time for clearing * it has not been reached. generation/event/data are ignored. * . CLEAR_EVENT will cause sanlock to zero the generation/event/data values * in the next renewal. host_id is ignored. * . REPLACE_EVENT will cause sanlock to replace the existing event/data * values when they would otherwise be rejected with -EBUSY due to a * previous set_event. * . ALL_HOSTS causes the bits for all host_ids to be set. * * Multiple set_event calls * . set_event replaces the last event/data values * . set_event replaces the last generation value * . set_event adds the host_id to the notification bitmap, * leaving any host_id bits that are already set. * * This allows the same event/data values to be passed to multiple * host_ids at once, but without using host_id generations; * generation should be set to 0 in the set_event calls. * * To send the same event/data values (A,B) to hosts 1,2,3: * T=10 set_event(1, A, B); * T=10 set_event(2, A, B); * T=10 set_event(3, A, B); * * The A,B values from each call replace those from the previous call, * but with no effect because they are the same. Bits for 1,2,3 will * all be set in the notification bitmap. * * To send different event/data values to different hosts, wait for * set_bitmap_seconds between the two set_event calls: * T=10 set_event(1, A, B); * T=70 set_event(2, C, D); * * The bit for 1 will be cleared from the bitmap by the time that the * bit for 2 is set. C,D replace A,B, but host 1 will have seen A,B * already, or won't be looking for it any longer. 
* * Sequential set_events with different event/data values, within a short * time span is likely to produce unwanted results, because the new * event/data values replace the previous values before the previous values * have been read: * T=10 set_event(1, A, B); * T=11 set_event(2, C, D); * * In this case, A,B are replaced by C,D, and both hosts 1 and 2 will be * notified of an event. host 1 will see values C,D, and will not get A,B. * * Unless the REPLACE_EVENT flag is used, sanlock will return -EBUSY from * set_event in this case. */ #define SANLK_SETEV_CUR_GENERATION 0x00000001 #define SANLK_SETEV_CLEAR_HOSTID 0x00000002 #define SANLK_SETEV_CLEAR_EVENT 0x00000004 #define SANLK_SETEV_REPLACE_EVENT 0x00000008 #define SANLK_SETEV_ALL_HOSTS 0x00000010 int sanlock_reg_event(const char *ls_name, struct sanlk_host_event *he, uint32_t flags); int sanlock_end_event(int fd, const char *ls_name, uint32_t flags); int sanlock_set_event(const char *ls_name, struct sanlk_host_event *he, uint32_t flags); int sanlock_get_event(int fd, uint32_t flags, struct sanlk_host_event *he, uint64_t *from_host_id, uint64_t *from_generation); #endif sanlock-3.8.2/src/sanlock_direct.h000066400000000000000000000023711371427612200171120ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __SANLOCK_DIRECT_H__ #define __SANLOCK_DIRECT_H__ /* * Use num_hosts = 0 for default value. 
* Provide either lockspace or resource, not both * * (Old api, see write_lockspace/resource) */ int sanlock_direct_init(struct sanlk_lockspace *ls, struct sanlk_resource *res, int max_hosts_unused, int num_hosts, int use_aio); /* * write a lockspace to disk * (also see sanlock_write_lockspace) */ int sanlock_direct_write_lockspace(struct sanlk_lockspace *ls, int max_hosts_unused, uint32_t flags, uint32_t io_timeout); /* * format a resource lease area on disk * (also see sanlock_write_resource) */ int sanlock_direct_write_resource(struct sanlk_resource *res, int max_hosts_unused, int num_hosts, uint32_t flags); /* * Returns the alignment in bytes required by sanlock_direct_init() * (1MB for disks with 512 sectors, 8MB for disks with 4096 sectors) */ int sanlock_direct_align(struct sanlk_disk *disk); #endif sanlock-3.8.2/src/sanlock_internal.h000066400000000000000000000276741371427612200174710ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #ifndef __SANLOCK_INTERNAL_H__ #define __SANLOCK_INTERNAL_H__ #ifndef GNUC_UNUSED #define GNUC_UNUSED __attribute__((__unused__)) #endif #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) #ifndef EXTERN #define EXTERN extern #else #undef EXTERN #define EXTERN #endif #include "sanlock.h" #include "sanlock_rv.h" #include "sanlock_resource.h" #include "leader.h" #include "paxos_dblock.h" #include "mode_block.h" #include "rindex_disk.h" #include "list.h" #include "monotime.h" #include "sizeflags.h" #include /* default max number of hosts supported */ #define DEFAULT_MAX_HOSTS 2000 #define LOG_DUMP_SIZE (1024*1024) /* this is just the path to the executable, not full command line */ #define COMMAND_MAX 4096 #define SANLOCK_RUN_DIR "SANLOCK_RUN_DIR" #define DEFAULT_RUN_DIR "/run/sanlock" #define SANLOCK_PRIVILEGED "SANLOCK_PRIVILEGED" #define SANLK_LOG_DIR "/var/log" #define SANLK_LOGFILE_NAME "sanlock.log" #define SANLK_LOCKFILE_NAME "sanlock.pid" #define SANLK_CONF_PATH "/etc/sanlock/sanlock.conf" #define DAEMON_NAME "sanlock" /* for paxos_lease sync_disk + offset: points to 1 leader_record + 1 request_record + MAX_HOSTS paxos_dblock's = 256 blocks = 128KB, ref: lease_item_record */ /* must mirror external sanlk_disk */ struct sync_disk { char path[SANLK_PATH_LEN]; uint64_t offset; uint32_t sector_size; /* sanlk_disk pad1 */ int fd; /* sanlk_disk pad2 */ }; struct delta_extra { uint64_t field1; uint64_t field2; uint64_t field3; }; /* * There are two different wrappers around a sanlk_resource: * 'struct token' keeps track of resources per-client, client.tokens[] * 'struct resource' keeps track of resources globally, resources list */ #define T_RESTRICT_SIGKILL 0x00000001 /* inherited from client->restricted */ #define T_RESTRICT_SIGTERM 0x00000002 /* inherited from client->restricted */ #define T_RETRACT_PAXOS 0x00000004 #define T_WRITE_DBLOCK_MBLOCK_SH 0x00000008 /* make paxos layer include mb SHARED with dblock */ #define T_CHECK_EXISTS 
0x00000010 /* make paxos layer not error if reading lease finds none */ struct token { /* values copied from acquire res arg */ uint64_t acquire_lver; uint64_t acquire_data64; uint32_t acquire_data32; uint32_t acquire_flags; /* copied from the sp with r.lockspace_name */ uint64_t host_id; uint64_t host_generation; uint32_t space_id; uint32_t io_timeout; /* internal */ struct list_head list; /* resource->tokens */ struct resource *resource; int pid; uint32_t flags; /* be careful to avoid using this from different threads */ uint32_t token_id; uint32_t res_id; int sector_size; int align_size; int space_dead; /* copied from sp->space_dead, set by main thread */ int shared_count; /* set during ballot by paxos_lease_acquire */ char shared_bitmap[HOSTID_BITMAP_SIZE]; /* bit set for host_id with SH */ struct sync_disk *disks; /* shorthand, points to r.disks[0] */ struct sanlk_resource r; /* * sanlk_resource must be the last element of token. * sanlk_resource ends with sanlk_disk disks[0], * and allocating a token allocates N sanlk_disk structs * after the token struct so they follow the sanlk_resource. 
*/ }; #define R_SHARED 0x00000001 #define R_THREAD_EXAMINE 0x00000002 #define R_THREAD_RELEASE 0x00000004 #define R_RESTRICT_SIGKILL 0x00000008 /* inherited from token */ #define R_RESTRICT_SIGTERM 0x00000010 /* inherited from token */ #define R_LVB_WRITE_RELEASE 0x00000020 #define R_UNDO_SHARED 0x00000040 #define R_ERASE_ALL 0x00000080 struct resource { struct list_head list; struct list_head tokens; /* only one token when ex, multiple sh */ uint64_t host_id; uint64_t host_generation; uint32_t io_timeout; int pid; /* copied from token when ex */ int sector_size; int align_size; uint32_t res_id; uint32_t reused; uint32_t flags; uint64_t thread_release_retry; char *lvb; char killpath[SANLK_HELPER_PATH_LEN]; /* copied from client */ char killargs[SANLK_HELPER_ARGS_LEN]; /* copied from client */ struct leader_record leader; /* copy of last leader_record we wrote */ struct paxos_dblock dblock; /* copy of last paxos_dblock we wrote */ struct sanlk_resource r; }; struct lease_status { int corrupt_result; int acquire_last_result; int renewal_last_result; uint64_t acquire_last_attempt; uint64_t acquire_last_success; uint64_t renewal_last_attempt; uint64_t renewal_last_success; uint32_t renewal_read_count; uint32_t renewal_read_check; char *renewal_read_buf; }; struct host_status { uint64_t first_check; /* local monotime */ uint64_t last_check; /* local monotime */ uint64_t last_live; /* local monotime */ uint64_t last_req; /* local monotime */ uint64_t owner_id; uint64_t owner_generation; uint64_t timestamp; /* remote monotime */ uint64_t set_bit_time; uint16_t io_timeout; uint16_t lease_bad; char owner_name[NAME_ID_SIZE]; }; struct renewal_history { uint64_t timestamp; int read_ms; int write_ms; int next_timeouts; int next_errors; }; /* The max number of connections that can get events for a lockspace. 
*/ #define MAX_EVENT_FDS 32 #define SP_EXTERNAL_USED 0x00000001 #define SP_USED_BY_ORPHANS 0x00000002 struct space { struct list_head list; char space_name[NAME_ID_SIZE]; uint32_t space_id; /* used to refer to this space instance in log messages */ uint64_t host_id; uint64_t host_generation; struct sync_disk host_id_disk; uint32_t io_timeout; uint32_t set_bitmap_seconds; uint32_t flags; /* SP_ */ uint32_t used_retries; uint32_t renewal_read_extend_sec; /* defaults to io_timeout */ uint32_t rindex_op; int sector_size; int align_size; int max_hosts; int renew_fail; int space_dead; int killing_pids; int external_remove; int thread_stop; int wd_fd; int event_fds[MAX_EVENT_FDS]; struct sanlk_host_event host_event; uint64_t set_event_time; pthread_t thread; pthread_mutex_t mutex; /* protects lease_status, thread_stop */ struct lease_status lease_status; struct host_status host_status[DEFAULT_MAX_HOSTS]; struct renewal_history *renewal_history; int renewal_history_size; int renewal_history_next; int renewal_history_prev; }; /* Update lockspace_info() to copy any fields from struct space to space_info */ struct space_info { uint32_t space_id; uint32_t io_timeout; uint64_t host_id; uint64_t host_generation; int sector_size; int align_size; int killing_pids; }; #define RX_OP_FORMAT 1 #define RX_OP_CREATE 2 #define RX_OP_DELETE 3 #define RX_OP_LOOKUP 4 #define RX_OP_UPDATE 5 #define RX_OP_REBUILD 6 #define HOSTID_AIO_CB_SIZE 4 #define WORKER_AIO_CB_SIZE 2 #define DIRECT_AIO_CB_SIZE 1 #define RESOURCE_AIO_CB_SIZE 2 #define LIB_AIO_CB_SIZE 1 struct aicb { int used; char *buf; struct iocb iocb; }; struct task { char name[NAME_ID_SIZE+1]; /* for log messages */ unsigned int io_count; /* stats */ unsigned int to_count; /* stats */ int use_aio; int cb_size; char *iobuf; io_context_t aio_ctx; struct aicb *read_iobuf_timeout_aicb; struct aicb *callbacks; }; EXTERN struct task main_task; /* TODO: change used, suspend, need_free, pid_dead to flags */ #define CL_KILLPATH_PID 0x00000001 
/* include pid as killpath arg */ #define CL_RUNPATH_SENT 0x00000002 /* a RUNPATH msg has been sent to helper */ struct client { int used; int fd; /* unset is -1 */ int pid; /* unset is -1 */ int cmd_active; int cmd_last; int pid_dead; int suspend; int need_free; int kill_count; int tokens_slots; uint32_t flags; uint32_t restricted; uint64_t kill_last; char owner_name[SANLK_NAME_LEN+1]; char killpath[SANLK_HELPER_PATH_LEN]; char killargs[SANLK_HELPER_ARGS_LEN]; pthread_mutex_t mutex; void *workfn; void *deadfn; struct token **tokens; }; /* * client array is only touched by main_loop, there is no lock for it. * individual cl structs are accessed by worker threads using cl->mutex */ EXTERN struct client *client; #define WATCHDOG_FIRE_TIMEOUT 60 #define DEFAULT_USE_AIO 1 #define DEFAULT_IO_TIMEOUT 10 #define DEFAULT_GRACE_SEC 40 #define DEFAULT_USE_WATCHDOG 1 #define DEFAULT_HIGH_PRIORITY 0 #define DEFAULT_MLOCK_LEVEL 1 /* 1=CURRENT, 2=CURRENT|FUTURE */ #define DEFAULT_SOCKET_UID 0 #define DEFAULT_SOCKET_GID 0 #define DEFAULT_SOCKET_MODE (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP) #define DEFAULT_MIN_WORKER_THREADS 2 #define DEFAULT_MAX_WORKER_THREADS 8 #define DEFAULT_SH_RETRIES 8 #define DEFAULT_QUIET_FAIL 1 #define DEFAULT_RENEWAL_HISTORY_SIZE 180 /* about 1 hour with 20 sec renewal interval */ #define DEFAULT_WRITE_INIT_IO_TIMEOUT 60 #define DEFAULT_MAX_SECTORS_KB_IGNORE 0 /* don't change it */ #define DEFAULT_MAX_SECTORS_KB_ALIGN 0 /* set it to align size */ #define DEFAULT_MAX_SECTORS_KB_NUM 1024 /* set it to num KB for all lockspaces */ struct command_line { int type; /* COM_ */ int action; /* ACT_ */ int debug; int debug_renew; int debug_clients; int debug_io_submit; int debug_io_complete; int paxos_debug_all; uint64_t debug_cmds; int max_sectors_kb_ignore; int max_sectors_kb_align; int max_sectors_kb_num; int quiet_fail; int wait; int use_watchdog; int high_priority; /* -h */ int get_hosts; /* -h */ int names_log_priority; int mlock_level; int max_worker_threads; int 
aio_arg; int io_timeout_arg; int write_init_io_timeout; int set_bitmap_seconds; int persistent; int orphan_set; int orphan; int used_set; int used; int all; int clear_arg; int sector_size; int align_size; char *uname; /* -U */ int uid; /* -U */ char *gname; /* -G */ int gid; /* -G */ int pid; /* -p */ char sort_arg; uint64_t host_id; /* -i */ uint64_t host_generation; /* -g */ uint64_t he_event; /* -e */ uint64_t he_data; /* -d */ int num_hosts; /* -n */ int max_hosts; /* -m */ int res_count; int sh_retries; uint32_t force_mode; int renewal_history_size; int renewal_read_extend_sec_set; /* 1 if renewal_read_extend_sec is configured */ uint32_t renewal_read_extend_sec; char our_host_name[SANLK_NAME_LEN+1]; char *file_path; char *dump_path; int rindex_op; struct sanlk_rentry rentry; /* -e */ struct sanlk_rindex rindex; /* -x RINDEX */ struct sanlk_lockspace lockspace; /* -s LOCKSPACE */ struct sanlk_resource *res_args[SANLK_MAX_RESOURCES]; /* -r RESOURCE */ }; EXTERN struct command_line com; uint32_t cmd_str_to_num(const char *str); uint64_t cmd_num_to_debug_flag(uint32_t cmd); int is_cmd_debug(uint32_t cmd); void set_cmd_debug(uint32_t cmd); void clear_cmd_debug(uint32_t cmd); /* command line types and actions */ #define COM_DAEMON 1 #define COM_CLIENT 2 #define COM_DIRECT 3 enum { ACT_STATUS = 1, ACT_HOST_STATUS, ACT_LOG_DUMP, ACT_SHUTDOWN, ACT_ADD_LOCKSPACE, ACT_INQ_LOCKSPACE, ACT_REM_LOCKSPACE, ACT_COMMAND, ACT_ACQUIRE, ACT_RELEASE, ACT_INQUIRE, ACT_CONVERT, ACT_REQUEST, ACT_ACQUIRE_ID, ACT_RELEASE_ID, ACT_RENEW_ID, ACT_DIRECT_INIT, ACT_DUMP, ACT_NEXT_FREE, ACT_READ_LEADER, ACT_CLIENT_INIT, ACT_CLIENT_READ, ACT_CLIENT_ALIGN, ACT_EXAMINE, ACT_GETS, ACT_VERSION, ACT_SET_EVENT, ACT_SET_CONFIG, ACT_WRITE_LEADER, ACT_RENEWAL, ACT_FORMAT, ACT_CREATE, ACT_DELETE, ACT_LOOKUP, ACT_UPDATE, ACT_REBUILD, }; EXTERN int external_shutdown; EXTERN char our_host_name_global[SANLK_NAME_LEN+1]; EXTERN int kill_count_max; EXTERN int kill_grace_seconds; EXTERN int helper_ci; EXTERN 
int helper_pid; EXTERN int helper_kill_fd; EXTERN int helper_status_fd; EXTERN uint64_t helper_last_status; EXTERN uint32_t helper_full_count; EXTERN int efd; EXTERN struct list_head spaces; EXTERN struct list_head spaces_rem; EXTERN struct list_head spaces_add; EXTERN pthread_mutex_t spaces_mutex; /* major.minor.patch-build (TODO: get build) */ EXTERN uint8_t sanlock_version_major; EXTERN uint8_t sanlock_version_minor; EXTERN uint8_t sanlock_version_patch; EXTERN uint8_t sanlock_version_build; EXTERN uint32_t sanlock_version_combined; #endif sanlock-3.8.2/src/sanlock_resource.h000066400000000000000000000133721371427612200174720ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. */ #ifndef __SANLOCK_RESOURCE_H__ #define __SANLOCK_RESOURCE_H__ /* * sock > -1, pid is ignored: * process creates registered connection and acquires/releases leases on * that connection for itself * * sock == -1, pid is used: * process asks daemon to acquire/release leases for another separately * registered pid */ /* restrict flags */ #define SANLK_RESTRICT_ALL 0x00000001 #define SANLK_RESTRICT_SIGKILL 0x00000002 #define SANLK_RESTRICT_SIGTERM 0x00000004 /* killpath flags */ #define SANLK_KILLPATH_PID 0x00000001 /* * acquire flags * * SANLK_ACQUIRE_LVB * Enable the use of an LVB with the lock. * * SANLK_ACQUIRE_ORPHAN * If the lock already exists as an orphan, * then acquire it. Otherwise, acquire a * new lock as usual. * * SANLK_ACQUIRE_ORPHAN_ONLY * If the lock already exists as an orphan, * then acquire it. Otherwise, do not acquire * a lock at all and return -ENOENT. 
* * SANLK_ACQUIRE_OWNER_NOWAIT * If the lock cannot be granted immediately * because the owner's lease needs to time out, do * not wait, but return -SANLK_ACQUIRE_OWNED_RETRY. */ #define SANLK_ACQUIRE_LVB 0x00000001 #define SANLK_ACQUIRE_ORPHAN 0x00000002 #define SANLK_ACQUIRE_ORPHAN_ONLY 0x00000004 #define SANLK_ACQUIRE_OWNER_NOWAIT 0x00000008 /* * release flags * * SANLK_REL_ALL * Release all resources held by the client. * The res args are ignored. * * SANLK_REL_RENAME * Rename the resource lease on disk when it * is released. The resource is freed and * renamed in a single disk operation (write * to the leader record.) The first res * arg is the resource to release, and the * second resource arg contains the new name * for the first resource. * * SANLK_REL_ORPHAN * Release orphan resources asynchronously. * Takes a single resource struct. If the * resource name is empty, then all orphans * for the specified lockspace are released. * If the resource name is set, then an * orphan with the matching resource name is * released. */ #define SANLK_REL_ALL 0x00000001 #define SANLK_REL_RENAME 0x00000002 #define SANLK_REL_ORPHAN 0x00000004 /* * convert flags * * SANLK_CONVERT_OWNER_NOWAIT * Same as SANLK_ACQUIRE_OWNER_NOWAIT. */ #define SANLK_CONVERT_OWNER_NOWAIT 0x00000008 /* NB: value must match SANLK_ACQUIRE_OWNER_NOWAIT */ /* * request flags * * SANLK_REQUEST_NEXT_LVER * The caller specifies 0 lver in res, and the daemon * automatically requests the current lver + 1. When * multiple hosts are making requests, this flag can * produce unexpected results, and it would be safer * to read the resource, check that the current owner * is the one being targetted, and use that owner's * lver + 1 as the specifically requested lver. */ #define SANLK_REQUEST_NEXT_LVER 0x00000001 /* * request force_mode * * SANLK_REQ_FORCE (SANLK_REQ_KILL_PID deprecated) * Send SIGKILL to the pid holding the resource * (or SIGTERM if SIGKILL is restricted.) 
* * SANLK_REQ_GRACEFUL * Run killpath against the pid if it is defined, otherwise * send SIGTERM to the pid (or SIGKILL if SIGTERM is restricted). */ #define SANLK_REQ_FORCE 0x00000001 #define SANLK_REQ_GRACEFUL 0x00000002 /* old name deprecated */ #define SANLK_REQ_KILL_PID SANLK_REQ_FORCE int sanlock_register(void); int sanlock_restrict(int sock, uint32_t flags); int sanlock_killpath(int sock, uint32_t flags, const char *path, char *args); int sanlock_acquire(int sock, int pid, uint32_t flags, int res_count, struct sanlk_resource *res_args[], struct sanlk_options *opt_in); int sanlock_release(int sock, int pid, uint32_t flags, int res_count, struct sanlk_resource *res_args[]); int sanlock_inquire(int sock, int pid, uint32_t flags, int *res_count, char **res_state); int sanlock_convert(int sock, int pid, uint32_t flags, struct sanlk_resource *res); int sanlock_request(uint32_t flags, uint32_t force_mode, struct sanlk_resource *res); int sanlock_examine(uint32_t flags, struct sanlk_lockspace *ls, struct sanlk_resource *res); int sanlock_set_lvb(uint32_t flags, struct sanlk_resource *res, char *lvb, int lvblen); int sanlock_get_lvb(uint32_t flags, struct sanlk_resource *res, char *lvb, int lvblen); /* * Functions to convert between string and struct resource formats. * All allocate space for returned data that the caller must free. */ /* * convert from struct sanlk_resource to string with format: * :::[::...]: */ int sanlock_res_to_str(struct sanlk_resource *res, char **str_ret); /* * convert to struct sanlk_resource from string with format: * :::[::...][:] */ int sanlock_str_to_res(char *str, struct sanlk_resource **res_ret); /* * convert from array of struct sanlk_resource * to state string with format: * "RESOURCE1 RESOURCE2 RESOURCE3 ..." 
* RESOURCE format in sanlock_res_to_str() comment */ int sanlock_args_to_state(int res_count, struct sanlk_resource *res_args[], char **res_state); /* * convert to array of struct sanlk_resource * from state string with format: * "RESOURCE1 RESOURCE2 RESOURCE3 ..." * RESOURCE format in sanlock_str_to_res() comment */ int sanlock_state_to_args(char *res_state, int *res_count, struct sanlk_resource ***res_args); /* * convert to struct sanlk_lockspace from string with format: * ::: */ int sanlock_str_to_lockspace(char *str, struct sanlk_lockspace *ls); #endif sanlock-3.8.2/src/sanlock_rv.h000066400000000000000000000040741371427612200162710ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. */ #ifndef __SANLOCK_RV_H__ #define __SANLOCK_RV_H__ #define SANLK_OK 1 #define SANLK_NONE 0 /* unused */ #define SANLK_ERROR -201 #define SANLK_AIO_TIMEOUT -202 #define SANLK_WD_ERROR -203 /* run_ballot */ #define SANLK_DBLOCK_READ -210 #define SANLK_DBLOCK_WRITE -211 #define SANLK_DBLOCK_LVER -212 #define SANLK_DBLOCK_MBAL -213 #define SANLK_DBLOCK_CHECKSUM -214 /* verify_leader, leader_read, leader_write (paxos or delta) (when adding to list, check if it should be a corrupt_result()) */ #define SANLK_LEADER_READ -220 #define SANLK_LEADER_WRITE -221 #define SANLK_LEADER_DIFF -222 #define SANLK_LEADER_MAGIC -223 #define SANLK_LEADER_VERSION -224 #define SANLK_LEADER_SECTORSIZE -225 #define SANLK_LEADER_LOCKSPACE -226 #define SANLK_LEADER_RESOURCE -227 #define SANLK_LEADER_NUMHOSTS -228 #define SANLK_LEADER_CHECKSUM -229 #define SANLK_ADDLS_SIZES -230 #define SANLK_ADDLS_INVALID_HOSTID -231 /* paxos_lease_acquire, paxos_lease_release */ #define SANLK_ACQUIRE_LVER -240 #define SANLK_ACQUIRE_LOCKSPACE -241 #define 
SANLK_ACQUIRE_IDDISK -242 #define SANLK_ACQUIRE_IDLIVE -243 #define SANLK_ACQUIRE_OWNED -244 #define SANLK_ACQUIRE_OTHER -245 #define SANLK_ACQUIRE_SHRETRY -246 #define SANLK_ACQUIRE_OWNED_RETRY -247 #define SANLK_RELEASE_LVER -250 #define SANLK_RELEASE_OWNER -251 /* delta_lease_renew, delta_lease_acquire */ #define SANLK_RENEW_OWNER -260 #define SANLK_RENEW_DIFF -261 #define SANLK_HOSTID_BUSY -262 /* request_token */ #define SANLK_REQUEST_MAGIC -270 #define SANLK_REQUEST_VERSION -271 #define SANLK_REQUEST_OLD -272 #define SANLK_REQUEST_LVER -273 /* rindex ops */ #define SANLK_RINDEX_MAGIC -274 #define SANLK_RINDEX_VERSION -275 #define SANLK_RINDEX_LOCKSPACE -276 #define SANLK_RINDEX_OFFSET -277 #define SANLK_RINDEX_DIFF -278 #endif sanlock-3.8.2/src/sanlock_sock.c000066400000000000000000000013641371427612200165730ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. */ #include #include #include #include #include #include #include #include "sanlock.h" #include "sanlock_sock.h" int sanlock_socket_address(const char *dir, struct sockaddr_un *addr) { memset(addr, 0, sizeof(struct sockaddr_un)); addr->sun_family = AF_LOCAL; snprintf(addr->sun_path, sizeof(addr->sun_path) - 1, "%s/%s", dir, SANLK_SOCKET_NAME); return 0; } sanlock-3.8.2/src/sanlock_sock.h000066400000000000000000000050571371427612200166030ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. 
*/ #ifndef __SANLOCK_SOCK_H__ #define __SANLOCK_SOCK_H__ #define SANLK_SOCKET_NAME "sanlock.sock" #define SM_MAGIC 0x04282010 #define SM_PROTO 0x00000001 #define SM_CB_PROTO 0x00000001 #define MAX_CLIENT_MSG (1024 * 1024) /* TODO: this is random */ enum { SM_CMD_REGISTER = 1, SM_CMD_ADD_LOCKSPACE = 2, SM_CMD_REM_LOCKSPACE = 3, SM_CMD_SHUTDOWN = 4, SM_CMD_STATUS = 5, SM_CMD_LOG_DUMP = 6, SM_CMD_ACQUIRE = 7, SM_CMD_RELEASE = 8, SM_CMD_INQUIRE = 9, SM_CMD_RESTRICT = 10, SM_CMD_REQUEST = 11, SM_CMD_ALIGN = 12, SM_CMD_EXAMINE_LOCKSPACE = 13, SM_CMD_EXAMINE_RESOURCE = 14, SM_CMD_HOST_STATUS = 15, SM_CMD_INQ_LOCKSPACE = 16, SM_CMD_KILLPATH = 17, SM_CMD_WRITE_LOCKSPACE = 18, SM_CMD_WRITE_RESOURCE = 19, SM_CMD_READ_LOCKSPACE = 20, SM_CMD_READ_RESOURCE = 21, SM_CMD_GET_LOCKSPACES = 22, SM_CMD_GET_HOSTS = 23, SM_CMD_READ_RESOURCE_OWNERS = 24, SM_CMD_SET_LVB = 25, SM_CMD_GET_LVB = 26, SM_CMD_CONVERT = 27, SM_CMD_VERSION = 28, SM_CMD_SHUTDOWN_WAIT = 29, SM_CMD_REG_EVENT = 30, SM_CMD_END_EVENT = 31, SM_CMD_SET_EVENT = 32, SM_CMD_SET_CONFIG = 33, SM_CMD_RENEWAL = 34, SM_CMD_FORMAT_RINDEX = 35, SM_CMD_UPDATE_RINDEX = 36, SM_CMD_LOOKUP_RINDEX = 37, SM_CMD_CREATE_RESOURCE = 38, SM_CMD_DELETE_RESOURCE = 39, SM_CMD_REBUILD_RINDEX = 40, }; #define SM_CB_GET_EVENT 1 struct sm_header { uint32_t magic; uint32_t version; uint32_t cmd; /* SM_CMD_ */ uint32_t cmd_flags; uint32_t length; uint32_t seq; uint32_t data; uint32_t data2; }; #define SANLK_STATE_MAXSTR 4096 #define SANLK_STATE_DAEMON 1 #define SANLK_STATE_CLIENT 2 #define SANLK_STATE_LOCKSPACE 3 #define SANLK_STATE_RESOURCE 4 #define SANLK_STATE_HOST 5 #define SANLK_STATE_RENEWAL 6 struct sanlk_state { uint32_t type; /* SANLK_STATE_ */ uint32_t flags; uint32_t data32; /* pid (for client) */ uint64_t data64; char name[SANLK_NAME_LEN]; /* client name or resource name */ uint32_t str_len; char str[0]; /* string of internal state */ }; int sanlock_socket_address(const char *dir, struct sockaddr_un *addr); struct event_cb { struct 
sm_header h; struct sanlk_host_event he; uint64_t from_host_id; uint64_t from_generation; }; #endif sanlock-3.8.2/src/sizeflags.c000066400000000000000000000210061371427612200161040ustar00rootroot00000000000000/* * Copyright 2018 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "sanlock.h" #include "sizeflags.h" #include "log.h" int size_to_max_hosts(int sector_size, int align_size) { if ((align_size == ALIGN_SIZE_1M) && (sector_size == 512)) { return 2000; } else if ((align_size == ALIGN_SIZE_1M) && (sector_size == 4096)) { return 250; } else if ((align_size == ALIGN_SIZE_2M) && (sector_size == 4096)) { return 500; } else if ((align_size == ALIGN_SIZE_4M) && (sector_size == 4096)) { return 1000; } else if ((align_size == ALIGN_SIZE_8M) && (sector_size == 4096)) { return 2000; } else { return 0; } } /* * In previous versions, 512 always implied 1M, and 4K implied 8M. * We fall back to this if an align size flag is not set in the * leader record. 
*/ int sector_size_to_align_size_old(int sector_size) { if (sector_size == 512) return ALIGN_SIZE_1M; if (sector_size == 4096) return ALIGN_SIZE_8M; return -1; } const char *align_size_debug_str(int align_size) { if (align_size == ALIGN_SIZE_1M) return "1M"; if (align_size == ALIGN_SIZE_2M) return "2M"; if (align_size == ALIGN_SIZE_4M) return "4M"; if (align_size == ALIGN_SIZE_8M) return "8M"; return NULL; } /* * struct leader_record */ uint32_t leader_align_flag_from_size(int align_size) { if (align_size == ALIGN_SIZE_1M) return LFL_ALIGN_1M; if (align_size == ALIGN_SIZE_2M) return LFL_ALIGN_2M; if (align_size == ALIGN_SIZE_4M) return LFL_ALIGN_4M; if (align_size == ALIGN_SIZE_8M) return LFL_ALIGN_8M; log_error("leader_align_flag_from_num unknown %d", align_size); return 0; } int leader_align_size_from_flag(uint32_t flags) { if (flags & LFL_ALIGN_1M) return ALIGN_SIZE_1M; if (flags & LFL_ALIGN_2M) return ALIGN_SIZE_2M; if (flags & LFL_ALIGN_4M) return ALIGN_SIZE_4M; if (flags & LFL_ALIGN_8M) return ALIGN_SIZE_8M; return 0; } /* * struct rindex_header */ uint32_t rindex_header_align_flag_from_size(int align_size) { if (align_size == ALIGN_SIZE_1M) return RHF_ALIGN_1M; if (align_size == ALIGN_SIZE_2M) return RHF_ALIGN_2M; if (align_size == ALIGN_SIZE_4M) return RHF_ALIGN_4M; if (align_size == ALIGN_SIZE_8M) return RHF_ALIGN_8M; log_error("rindex_header_align_flag_from_size unknown %d", align_size); return 0; } int rindex_header_align_size_from_flag(uint32_t flags) { if (flags & RHF_ALIGN_1M) return ALIGN_SIZE_1M; if (flags & RHF_ALIGN_2M) return ALIGN_SIZE_2M; if (flags & RHF_ALIGN_4M) return ALIGN_SIZE_4M; if (flags & RHF_ALIGN_8M) return ALIGN_SIZE_8M; return 0; } /* * struct sanlk_lockspace */ int sanlk_lsf_sector_flag_to_size(uint32_t flags) { if (flags & SANLK_LSF_SECTOR512) return 512; if (flags & SANLK_LSF_SECTOR4K) return 4096; return 0; } uint32_t sanlk_lsf_sector_size_to_flag(int sector_size) { if (sector_size == 512) return SANLK_LSF_SECTOR512; if 
(sector_size == 4096) return SANLK_LSF_SECTOR4K; log_error("sanlk_lsf_sector_size_to_flag invalid sector size %d", sector_size); return 0; } void sanlk_lsf_sector_flags_clear(uint32_t *flags) { *flags &= ~SANLK_LSF_SECTOR512; *flags &= ~SANLK_LSF_SECTOR4K; } void sanlk_lsf_align_flags_clear(uint32_t *flags) { *flags &= ~SANLK_LSF_ALIGN1M; *flags &= ~SANLK_LSF_ALIGN2M; *flags &= ~SANLK_LSF_ALIGN4M; *flags &= ~SANLK_LSF_ALIGN8M; } int sanlk_lsf_align_flag_to_size(uint32_t flags) { if (flags & SANLK_LSF_ALIGN1M) return ALIGN_SIZE_1M; if (flags & SANLK_LSF_ALIGN2M) return ALIGN_SIZE_2M; if (flags & SANLK_LSF_ALIGN4M) return ALIGN_SIZE_4M; if (flags & SANLK_LSF_ALIGN8M) return ALIGN_SIZE_8M; return 0; } uint32_t sanlk_lsf_align_size_to_flag(int align_size) { if (align_size == ALIGN_SIZE_1M) return SANLK_LSF_ALIGN1M; if (align_size == ALIGN_SIZE_2M) return SANLK_LSF_ALIGN2M; if (align_size == ALIGN_SIZE_4M) return SANLK_LSF_ALIGN4M; if (align_size == ALIGN_SIZE_8M) return SANLK_LSF_ALIGN8M; log_error("sanlk_lsf_align_size_to_flag invalid align size %d", align_size); return 0; } /* * struct sanlk_resource */ int sanlk_res_sector_flag_to_size(uint32_t flags) { if (flags & SANLK_RES_SECTOR512) return 512; if (flags & SANLK_RES_SECTOR4K) return 4096; return 0; } uint32_t sanlk_res_sector_size_to_flag(int sector_size) { if (sector_size == 512) return SANLK_RES_SECTOR512; if (sector_size == 4096) return SANLK_RES_SECTOR4K; log_error("sanlk_res_sector_size_to_flag invalid sector size %d", sector_size); return 0; } void sanlk_res_sector_flags_clear(uint32_t *flags) { *flags &= ~SANLK_RES_SECTOR512; *flags &= ~SANLK_RES_SECTOR4K; } void sanlk_res_align_flags_clear(uint32_t *flags) { *flags &= ~SANLK_RES_ALIGN1M; *flags &= ~SANLK_RES_ALIGN2M; *flags &= ~SANLK_RES_ALIGN4M; *flags &= ~SANLK_RES_ALIGN8M; } int sanlk_res_align_flag_to_size(uint32_t flags) { if (flags & SANLK_RES_ALIGN1M) return ALIGN_SIZE_1M; if (flags & SANLK_RES_ALIGN2M) return ALIGN_SIZE_2M; if (flags & 
SANLK_RES_ALIGN4M) return ALIGN_SIZE_4M; if (flags & SANLK_RES_ALIGN8M) return ALIGN_SIZE_8M; return 0; } uint32_t sanlk_res_align_size_to_flag(int align_size) { if (align_size == ALIGN_SIZE_1M) return SANLK_RES_ALIGN1M; if (align_size == ALIGN_SIZE_2M) return SANLK_RES_ALIGN2M; if (align_size == ALIGN_SIZE_4M) return SANLK_RES_ALIGN4M; if (align_size == ALIGN_SIZE_8M) return SANLK_RES_ALIGN8M; log_error("sanlk_res_align_size_to_flag invalid align size %d", align_size); return 0; } /* * struct sanlk_rindex */ int sanlk_rif_sector_flag_to_size(uint32_t flags) { if (flags & SANLK_RIF_SECTOR512) return 512; if (flags & SANLK_RIF_SECTOR4K) return 4096; return 0; } uint32_t sanlk_rif_sector_size_to_flag(int sector_size) { if (sector_size == 512) return SANLK_RIF_SECTOR512; if (sector_size == 4096) return SANLK_RIF_SECTOR4K; log_error("sanlk_rif_sector_size_to_flag invalid sector size %d", sector_size); return 0; } int sanlk_rif_align_flag_to_size(uint32_t flags) { if (flags & SANLK_RIF_ALIGN1M) return ALIGN_SIZE_1M; if (flags & SANLK_RIF_ALIGN2M) return ALIGN_SIZE_2M; if (flags & SANLK_RIF_ALIGN4M) return ALIGN_SIZE_4M; if (flags & SANLK_RIF_ALIGN8M) return ALIGN_SIZE_8M; return 0; } uint32_t sanlk_rif_align_size_to_flag(int align_size) { if (align_size == ALIGN_SIZE_1M) return SANLK_RIF_ALIGN1M; if (align_size == ALIGN_SIZE_2M) return SANLK_RIF_ALIGN2M; if (align_size == ALIGN_SIZE_4M) return SANLK_RIF_ALIGN4M; if (align_size == ALIGN_SIZE_8M) return SANLK_RIF_ALIGN8M; log_error("sanlk_rif_align_size_to_flag invalid align size %d", align_size); return 0; } /* * Translate struct flags passed from libsanlock to numbers. 
*/ int sizes_from_flags(uint32_t flags, int *sector_size, int *align_size, int *max_hosts, const char *kind) { int no_align_flag = 0; int no_sector_flag = 0; *sector_size = 0; *align_size = 0; *max_hosts = 0; /* SANLK_RES flags in sanlk_resource.flags */ if (!strcmp(kind, "RES")) { *align_size = sanlk_res_align_flag_to_size(flags); if (!*align_size) no_align_flag = 1; *sector_size = sanlk_res_sector_flag_to_size(flags); if (!*sector_size) no_sector_flag = 1; } /* SANLK_LSF flags in sanlk_lockspace.flags */ else if (!strcmp(kind, "LSF")) { *align_size = sanlk_lsf_align_flag_to_size(flags); if (!*align_size) no_align_flag = 1; *sector_size = sanlk_lsf_sector_flag_to_size(flags); if (!*sector_size) no_sector_flag = 1; } /* SANLK_RIF flags in sanlk_rindex.flags */ else if (!strcmp(kind, "RIF")) { *align_size = sanlk_rif_align_flag_to_size(flags); if (!*align_size) no_align_flag = 1; *sector_size = sanlk_rif_sector_flag_to_size(flags); if (!*sector_size) no_sector_flag = 1; } else { log_error("unknown kind %s of flags %x", kind, flags); return -1; } if (no_sector_flag != no_align_flag) { log_error("ALIGN and SECTOR flags %s %x must both be set", kind, flags); return -1; } if (!*sector_size) return 0; *max_hosts = size_to_max_hosts(*sector_size, *align_size); if (!*max_hosts) { log_error("Invalid combination of ALIGN and SECTOR flags %s %x", kind, flags); return -1; } return 0; } sanlock-3.8.2/src/sizeflags.h000066400000000000000000000034211371427612200161120ustar00rootroot00000000000000/* * Copyright 2018 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #ifndef __SIZES_H__ #define __SIZES_H__ #define ALIGN_SIZE_1M 1048576 #define ALIGN_SIZE_2M (2 * ALIGN_SIZE_1M) #define ALIGN_SIZE_4M (4 * ALIGN_SIZE_1M) #define ALIGN_SIZE_8M (8 * ALIGN_SIZE_1M) int size_to_max_hosts(int sector_size, int align_size); int sector_size_to_align_size_old(int sector_size); const char *align_size_debug_str(int align_size); uint32_t leader_align_flag_from_size(int align_size); int leader_align_size_from_flag(uint32_t flags); uint32_t rindex_header_align_flag_from_size(int align_size); int rindex_header_align_size_from_flag(uint32_t flags); int sanlk_lsf_sector_flag_to_size(uint32_t flags); uint32_t sanlk_lsf_sector_size_to_flag(int sector_size); int sanlk_lsf_align_flag_to_size(uint32_t flags); uint32_t sanlk_lsf_align_size_to_flag(int align_size); void sanlk_lsf_sector_flags_clear(uint32_t *flags); void sanlk_lsf_align_flags_clear(uint32_t *flags); int sanlk_res_sector_flag_to_size(uint32_t flags); uint32_t sanlk_res_sector_size_to_flag(int sector_size); int sanlk_res_align_flag_to_size(uint32_t flags); uint32_t sanlk_res_align_size_to_flag(int align_size); void sanlk_res_sector_flags_clear(uint32_t *flags); void sanlk_res_align_flags_clear(uint32_t *flags); int sanlk_rif_sector_flag_to_size(uint32_t flags); uint32_t sanlk_rif_sector_size_to_flag(int sector_size); int sanlk_rif_align_flag_to_size(uint32_t flags); uint32_t sanlk_rif_align_size_to_flag(int align_size); int sizes_from_flags(uint32_t flags, int *sector_size, int *align_size, int *max_hosts, const char *kind); #endif sanlock-3.8.2/src/task.c000066400000000000000000000060001371427612200150540ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "log.h" #include "task.h" void setup_task_aio(struct task *task, int use_aio, int cb_size) { int rv; task->use_aio = use_aio; memset(&task->aio_ctx, 0, sizeof(task->aio_ctx)); /* main task doesn't actually do disk io so it passes in, * cb_size 0, but it still wants use_aio set for other * tasks to copy */ if (!use_aio) return; if (!cb_size) return; rv = io_setup(cb_size, &task->aio_ctx); if (rv < 0) goto fail; task->cb_size = cb_size; task->callbacks = malloc(cb_size * sizeof(struct aicb)); if (!task->callbacks) { rv = -ENOMEM; goto fail_setup; } memset(task->callbacks, 0, cb_size * sizeof(struct aicb)); return; fail_setup: io_destroy(task->aio_ctx); fail: task->use_aio = 0; } void close_task_aio(struct task *task) { struct timespec ts; struct io_event event; uint64_t last_warn; uint64_t begin; uint64_t now; int rv, i, used, lvl; if (!task->use_aio) goto skip_aio; memset(&ts, 0, sizeof(struct timespec)); ts.tv_sec = DEFAULT_IO_TIMEOUT; last_warn = time(NULL); begin = last_warn; /* wait for all outstanding aio to complete before destroying aio context, freeing iocb and buffers */ while (1) { now = time(NULL); if (now - last_warn >= (DEFAULT_IO_TIMEOUT * 6)) { last_warn = now; lvl = LOG_ERR; } else { lvl = LOG_DEBUG; } used = 0; for (i = 0; i < task->cb_size; i++) { if (!task->callbacks[i].used) continue; used++; log_level(0, 0, task->name, lvl, "close_task_aio %d %p busy", i, &task->callbacks[i]); } if (!used) break; if (now - begin >= 120) break; memset(&event, 0, sizeof(event)); rv = io_getevents(task->aio_ctx, 1, 1, &event, &ts); if (rv == -EINTR) continue; if (rv < 0) break; if (rv == 1) { struct iocb *ev_iocb = event.obj; struct aicb *ev_aicb = container_of(ev_iocb, struct aicb, iocb); if (ev_aicb->buf == task->iobuf) task->iobuf = NULL; log_taske(task, "aio collect %p:%p:%p result 
%ld:%ld close free", ev_aicb, ev_iocb, ev_aicb->buf, event.res, event.res2); ev_aicb->used = 0; free(ev_aicb->buf); ev_aicb->buf = NULL; } } if (used) log_taskd(task, "close_task_aio destroy %d incomplete ops", used); io_destroy(task->aio_ctx); if (used) log_taske(task, "close_task_aio destroyed %d incomplete ops", used); if (task->iobuf) free(task->iobuf); skip_aio: if (task->callbacks) free(task->callbacks); task->callbacks = NULL; } sanlock-3.8.2/src/task.h000066400000000000000000000006511371427612200150670ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #ifndef __TASK_H__ #define __TASK_H__ void setup_task_aio(struct task *task, int use_aio, int cb_size); void close_task_aio(struct task *task); #endif sanlock-3.8.2/src/timeouts.c000066400000000000000000000067351371427612200160020ustar00rootroot00000000000000/* * Copyright 2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "log.h" #include "task.h" #include "timeouts.h" int calc_host_dead_seconds(int io_timeout) { /* id_renewal_fail_seconds + WATCHDOG_FIRE_TIMEOUT */ return (8 * io_timeout) + WATCHDOG_FIRE_TIMEOUT; } int calc_id_renewal_seconds(int io_timeout) { return 2 * io_timeout; } int calc_id_renewal_fail_seconds(int io_timeout) { return 8 * io_timeout; } int calc_id_renewal_warn_seconds(int io_timeout) { return 6 * io_timeout; } int calc_set_bitmap_seconds(int io_timeout) { if (com.set_bitmap_seconds) return com.set_bitmap_seconds; /* 3 * id_renewal_seconds, somewhat random choice */ return 6 * io_timeout; } void log_timeouts(int io_timeout_arg) { int io_timeout_seconds = io_timeout_arg; int id_renewal_seconds = 2 * io_timeout_seconds; int id_renewal_fail_seconds = 8 * io_timeout_seconds; int id_renewal_warn_seconds = 6 * io_timeout_seconds; /* those above are chosen by us, the rest are based on them */ int host_dead_seconds = id_renewal_fail_seconds + WATCHDOG_FIRE_TIMEOUT; int delta_large_delay = id_renewal_seconds + (6 * io_timeout_seconds); int delta_short_delay = 2 * io_timeout_seconds; int max = host_dead_seconds; if (delta_large_delay > max) max = delta_large_delay; int delta_acquire_held_max = max + delta_short_delay + (4 * io_timeout_seconds); int delta_acquire_held_min = max; int delta_acquire_free_max = delta_short_delay + (3 * io_timeout_seconds); int delta_acquire_free_min = delta_short_delay; int delta_renew_max = 2 * io_timeout_seconds; int delta_renew_min = 0; int paxos_acquire_held_max = host_dead_seconds + (7 * io_timeout_seconds); int paxos_acquire_held_min = host_dead_seconds; int paxos_acquire_free_max = 6 * io_timeout_seconds; int paxos_acquire_free_min = 0; int request_finish_seconds = 3 * id_renewal_seconds; /* random */ log_debug("io_timeout_seconds %d", io_timeout_seconds); 
log_debug("id_renewal_seconds %d", id_renewal_seconds); log_debug("id_renewal_fail_seconds %d", id_renewal_fail_seconds); log_debug("id_renewal_warn_seconds %d", id_renewal_warn_seconds); log_debug("host_dead_seconds %d", host_dead_seconds); log_debug("delta_large_delay %d", delta_large_delay); log_debug("delta_short_delay %d", delta_short_delay); log_debug("delta_acquire_held_max %d", delta_acquire_held_max); log_debug("delta_acquire_held_min %d", delta_acquire_held_min); log_debug("delta_acquire_free_max %d", delta_acquire_free_max); log_debug("delta_acquire_free_min %d", delta_acquire_free_min); log_debug("delta_renew_max %d", delta_renew_max); log_debug("delta_renew_min %d", delta_renew_min); log_debug("paxos_acquire_held_max %d", paxos_acquire_held_max); log_debug("paxos_acquire_held_min %d", paxos_acquire_held_min); log_debug("paxos_acquire_free_max %d", paxos_acquire_free_max); log_debug("paxos_acquire_free_min %d", paxos_acquire_free_min); log_debug("request_finish_seconds %d", request_finish_seconds); } sanlock-3.8.2/src/timeouts.h000066400000000000000000000462471371427612200160110ustar00rootroot00000000000000/* * Copyright 2010-2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ /* * Example of watchdog behavior when host_id renewals fail, assuming * that sanlock cannot successfully kill the pids it is supervising that * depend on the given host_id. 
* * * Using these values in the example * wdmd test interval = 10 (defined in wdmd/main.c) * watchdog_fire_timeout = 60 (constant) * io_timeout_seconds = 10 (defined by us) * id_renewal_seconds = 20 (= delta_renew_max = 2 * io_timeout_seconds) * id_renewal_fail_seconds = 80 (= 4 * delta_renew_max = 8 * io_timeout_seconds) * host_dead_seconds = 140 (id_renewal_fail_seconds + watchdog_fire_timeout) * * T time in seconds * * 0: sanlock renews host_id on disk * sanlock calls wdmd_test_live(0, 80) [0 + 80] * wdmd test_client sees now 0 < expire 80 ok -> keepalive * * 10: wdmd test_client sees now 10 < expire 80 ok -> keepalive * * 20: sanlock renews host_id on disk ok * sanlock calls wdmd_test_live(20, 100) [20 + 80] * wdmd test_client sees now 20 < expire 100 or 80 ok -> keepalive * * 30: wdmd test_client sees now 30 < expire 100 ok -> keepalive * * 40: sanlock renews host_id on disk ok * sanlock calls wdmd_test_live(40, 120) [40 + 80] * wdmd test_client sees now 40 < expire 120 or 100 ok -> keepalive * * 50: wdmd test_client sees now 50 < expire 120 ok -> keepalive * * all normal until 59 * --------------------------------------------------------- * problems begin at 60 * * 60: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now 60 < expire 120 ok -> keepalive * * 70: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now 70 < expire 120 ok -> keepalive * * 80: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now 80 < expire 120 ok -> keepalive * * 90: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now 90 < expire 120 ok -> keepalive * * 100: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now 100 < expire 120 ok -> keepalive * messages: check_our_lease warning (sanlock) * * 110: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now 110 < expire 120 ok -> 
keepalive (from dev close) * messages: watchdog closed unclean (wdmd), test warning (wdmd) * * 120: sanlock fails to renew host_id on disk -> no wdmd_test_live * sanlock enters recovery mode and starts killing pids because we have reached * now (120) is id_renewal_fail_seconds (80) after last renewal (40) * wdmd test_client sees now 120 >= expire 120 fail -> no keepalive * messages: check_our_lease failed (sanlock), test failed (wdmd) * * . /dev/watchdog will fire at last keepalive + watchdog_fire_timeout = * T110 + 60 = T170 * . host_id will expire at * last disk renewal ok + id_renewal_fail_seconds + watchdog_fire_timeout * T40 + 80 + 60 = T180 * (aka last disk renewal ok + host_dead_seconds, T40 + 140 = T180) * . the wdmd test at T110 could have been at T119, so wdmd would have * seen the client unexpired/ok and done keepalive at 119 just before the * expiry at 120, which would lead to /dev/watchdog firing at 119+60 = T179 * . so, the watchdog could fire as early as T170 or as late as T179, but * the host_id will not expire until T180 * * 130: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now 130 > expire 120 fail -> no keepalive * * 140: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now 140 > expire 120 fail -> no keepalive * * 150: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now 150 > expire 120 fail -> no keepalive * * 160: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now 160 > expire 120 fail -> no keepalive * * 170: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now 170 > expire 120 fail -> no keepalive * /dev/watchdog fires because last keepalive was T110, 60 seconds ago * (earliest possible /dev/watchdog firing due to wdmd checking expiry just * after sanlock calls wdmd_test_live at T110 and just after the expiry at T120) * * 179: (latest possible /dev/watchdog 
firing due to wdmd checking expiry just * before the expiry at T119) * * 180: another host can acquire leases held by host_id. * This is host_dead_seconds (140) after the last successful renewal (T40) */ /* * Example of watchdog behavior when host_id renewals fail, assuming * that sanlock cannot successfully kill the pids it is supervising that * depend on the given host_id. * * * Using these values in the example * wdmd test interval = 10 (defined in wdmd/main.c) * watchdog_fire_timeout = 60 (constant) * io_timeout_seconds = 20 (defined by us) * id_renewal_seconds = 40 (= delta_renew_max = 2 * io_timeout_seconds) * id_renewal_fail_seconds = 160 (= 4 * delta_renew_max = 8 * io_timeout_seconds) * host_dead_seconds = 220 (id_renewal_fail_seconds + watchdog_fire_timeout) * * T time in seconds * * 0: sanlock renews host_id on disk * sanlock calls wdmd_test_live(0, 160) [0 + 160] * wdmd test_client sees now 0 < expire 160 ok -> keepalive * * 10: wdmd test_client sees now < expire 160 ok -> keepalive * 20: wdmd test_client sees now < expire 160 ok -> keepalive * 30: wdmd test_client sees now < expire 160 ok -> keepalive * * 40: sanlock renews host_id on disk ok * sanlock calls wdmd_test_live(40, 200) [40 + 160] * wdmd test_client sees now 40 < expire 200 or 160 ok -> keepalive * * 50: wdmd test_client sees now < expire 200 ok -> keepalive * 60: wdmd test_client sees now < expire 200 ok -> keepalive * 70: wdmd test_client sees now < expire 200 ok -> keepalive * * 80: sanlock renews host_id on disk ok * sanlock calls wdmd_test_live(80, 240) [80 + 160] * wdmd test_client sees now 80 < expire 240 or 200 ok -> keepalive * * 90: wdmd test_client sees now < expire 240 ok -> keepalive * 100: wdmd test_client sees now < expire 240 ok -> keepalive * 110: wdmd test_client sees now < expire 240 ok -> keepalive * * all normal until 119 * --------------------------------------------------------- * problems begin at 120 * * 120: sanlock fails to renew host_id on disk -> no 
wdmd_test_live * wdmd test_client sees now 120 < expire 240 ok -> keepalive * * 130: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now < expire 240 ok -> keepalive * 140: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now < expire 240 ok -> keepalive * 150: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now < expire 240 ok -> keepalive * 160: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now < expire 240 ok -> keepalive * 170: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now < expire 240 ok -> keepalive * 180: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now < expire 240 ok -> keepalive * 190: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now < expire 240 ok -> keepalive * 200: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now < expire 240 ok -> keepalive * 210: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now < expire 240 ok -> keepalive * 220: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now < expire 240 ok -> keepalive * 230: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now < expire 240 ok -> keepalive * * 240: sanlock fails to renew host_id on disk -> no wdmd_test_live * sanlock enters recovery mode and starts killing pids * wdmd test_client sees now 240 >= expire 240 fail -> no keepalive * wdmd starts logging error messages every 10 sec * * . /dev/watchdog will fire at last keepalive + watchdog_fire_timeout = * T230 + 60 = T290 * . host_id will expire at * last disk renewal ok + id_renewal_fail_seconds + watchdog_fire_timeout * T80 + 160 + 60 = T300 * (aka last disk renewal ok + host_dead_seconds, T80 + 220 = T300) * . 
the wdmd test at T230 could have been at T239, so wdmd would have * seen the client unexpired/ok and done keepalive at 239 just before the * expiry at 240, which would lead to /dev/watchdog firing at 239+60 = T299 * . so, the watchdog could fire as early as T290 or as late as T299, but * the host_id will not expire until T300 * * 250: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now > expire 240 fail -> no keepalive * 260: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now > expire 240 fail -> no keepalive * 270: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now > expire 240 fail -> no keepalive * 280: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now > expire 240 fail -> no keepalive * 290: sanlock fails to renew host_id on disk -> no wdmd_test_live * wdmd test_client sees now > expire 240 fail -> no keepalive * /dev/watchdog fires because last keepalive was T230, 60 seconds ago * (earliest possible /dev/watchdog firing due to wdmd checking expiry * just after sanlock calls wdmd_test_live at T230 and just after expiry at T240) * * 299: (latest possible /dev/watchdog firing due to wdmd checking expiry just * before the expiry at T239) * * 300: another host can acquire leases held by host_id * This is host_dead_seconds (220) after last successful renewal (T80) */ /* * killing pids * * From the time sanlock enters recovery mode and starts killing pids at T120, * until /dev/watchdog fires between T170 and T179, we need to attempt to * gracefully kill pids for some time, and then leave around 10 seconds to * escalate to SIGKILL and clean up leases from the exited pids. * * Working backward from the earlier watchdog firing at T170, leaving 10 seconds * for SIGKILL to succeed, we need to begin SIGKILL at T160. This means we * have from T120 to T160 to allow graceful kill to complete. 
So, kill_grace_seconds * should be set to 40 by default (T120 to T160). * * T40: last successful disk renewal * T120 - T159: graceful pid shutdown (40 sec) * T160 - T169: SIGKILL once per second (10 sec) * T170 - T179: watchdog fires sometime (SIGKILL continues) * T180: other hosts acquire our leases */ /* * "delta" refers to time-based leases described in Chockler/Malkhi that * we use for host_id ownership. * * "paxos" refers to disk paxos based leases described in Lamport that * we use for resource (vm) ownership. * * "free" refers to a lease (either type) that is not owned by anyone * * "held" refers to a lease (either type) that was owned by a host that * failed, so it was not released/freed. * * "max" refers to the maximum time that a successful acquire/renew can * take, assuming that every io operation takes the max allowable time * (io_timeout_seconds) * * "min" refers to the minimum time that a successful acquire/renew can * take, assuming that every io operation completes immediately, in * effectively zero time * * * io_timeout_seconds: defined by us * * id_renewal_seconds: defined by us * * id_renewal_fail_seconds: defined by us * * watchdog_fire_timeout: /dev/watchdog will fire without being petted this long * = 60 constant * * host_dead_seconds: the length of time from the last successful host_id * renewal until that host is killed by its watchdog.
* = id_renewal_fail_seconds + watchdog_fire_timeout * * delta_large_delay: from the algorithm * = id_renewal_seconds + (6 * io_timeout_seconds) * * delta_short_delay: from the algorithm * = 2 * io_timeout_seconds * * delta_acquire_held_max: max time it can take to successfully * acquire a non-free delta lease * = io_timeout_seconds (read) + * max(delta_large_delay, host_dead_seconds) + * io_timeout_seconds (read) + * io_timeout_seconds (write) + * delta_short_delay + * io_timeout_seconds (read) * * delta_acquire_held_min: min time it can take to successfully * acquire a non-free delta lease * = max(delta_large_delay, host_dead_seconds) * * delta_acquire_free_max: max time it can take to successfully * acquire a free delta lease. * = io_timeout_seconds (read) + * io_timeout_seconds (write) + * delta_short_delay + * io_timeout_seconds (read) * * delta_acquire_free_min: min time it can take to successfully * acquire a free delta lease. * = delta_short_delay * * delta_renew_max: max time it can take to successfully * renew a delta lease. * = io_timeout_seconds (read) + * io_timeout_seconds (write) * * delta_renew_min: min time it can take to successfully * renew a delta lease. * = 0 * * paxos_acquire_held_max: max time it can take to successfully * acquire a non-free paxos lease, uncontended. * = io_timeout_seconds (read leader) + * host_dead_seconds + * io_timeout_seconds (read leader) + * io_timeout_seconds (write dblock) + * io_timeout_seconds (read dblocks) + * io_timeout_seconds (write dblock) + * io_timeout_seconds (read dblocks) + * io_timeout_seconds (write leader) * * paxos_acquire_held_min: min time it can take to successfully * acquire a non-free paxos lease, uncontended. * = host_dead_seconds * * paxos_acquire_free_max: max time it can take to successfully * acquire a free paxos lease, uncontended. 
* = io_timeout_seconds (read leader) + * io_timeout_seconds (write dblock) + * io_timeout_seconds (read dblocks) + * io_timeout_seconds (write dblock) + * io_timeout_seconds (read dblocks) + * io_timeout_seconds (write leader) * * paxos_acquire_free_min: min time it can take to successfully * acquire a free paxos lease, uncontended. * = 0 * * * How to configure the combination of related timeouts defined by us: * io_timeout_seconds * id_renewal_seconds * id_renewal_fail_seconds * * Here's one approach that seems to produce sensible sets of numbers: * * io_timeout_seconds = N * . max time one io can take * * delta_renew_max = 2N * . max time one renewal can take * * id_renewal_seconds = delta_renew_max (2N) * . delay this long after renewal success before next renew attempt begins * . this will be the difference between two successive renewal timestamps * when io times are effectively 0 * . there's no particular reason for it to be 2N exactly * . if a successful renewal takes the max possible time (delta_renew_max), * then the next renewal attempt will begin right away * . (if a renewal fails we always attempt another renewal immediately) * * id_renewal_fail_seconds = 4 * delta_renew_max (8N) * . time from last successful renewal until recovery begins * . allows for three consecutive max len renewal failures, i.e. * id_renewal_seconds + (3 * delta_renew_max) * * id_renewal_warn_seconds = 3 * delta_renew_max (6N) * . time from last successful renewal until warning about renewal length * . allows for two consecutive max len renewal failures * * T time in seconds * 0 renewal ok * 2N renewal attempt begin * 4N renewal attempt fail1 (each io takes max time) * 4N renewal attempt begin * 6N renewal attempt fail2 (each io takes max time) * 6N renewal attempt begin * 8N renewal attempt fail3 (each io takes max time) * 8N recovery begins (pids killed) * * If ios don't take the max len (delta_renew_max), this just * gives us more attempts to renew before recovery begins.
* * io_timeout_seconds N 5 10 20 * id_renewal_seconds 2N 10 20 40 * id_renewal_fail_seconds 8N 40 80 160 * * 5 sec io timeout: fast storage io perf * 10 sec io timeout: normal storage io perf * 20 sec io timeout: slow storage io perf * * [We could break down these computations further by adding a variable * F = number of full len renewal failures allowed before recovery * begins. Above F is fixed at 3, but we may want to vary it to be * 2 or 4.] * * fast norm slow * watchdog_fire_timeout 60 60 60 * * io_timeout_seconds 5 10 20 * id_renewal_seconds 10 20 40 * id_renewal_fail_seconds 40 80 160 * id_renewal_warn_seconds 30 60 120 * * host_dead_seconds 100 140 220 * delta_large_delay 40 80 160 * delta_short_delay 10 20 40 * delta_acquire_held_max 130 200 340 * delta_acquire_held_min 100 140 220 * delta_acquire_free_max 25 50 100 * delta_acquire_free_min 10 20 40 * delta_renew_max 10 20 40 * delta_renew_min 0 0 0 * paxos_acquire_held_max 135 210 360 * paxos_acquire_held_min 100 140 220 * paxos_acquire_free_max 30 60 120 * paxos_acquire_free_min 0 0 0 */ /* * Why does delta_acquire use max(delta_large_delay, host_dead_seconds) instead * of just delta_large_delay as specified in the algorithm? * * 1. the time based lease algorithm uses delta_large_delay to determine that a * host is failed, but we want to be more certain the host is dead based on its * watchdog firing, and we know the watchdog has fired after host_dead_seconds. * * 2. if a delta lease can be acquired and released (freed) before * host_dead_seconds, that could allow the paxos leases of a failed host to be * acquired by someone else before host_dead_seconds (and before the failed * host is really dead), because acquiring a held paxos lease depends on the * delta lease of the failed owner not changing for host_dead_seconds. * We cannot allow a host to acquire another failed host's paxos lease before * host_dead_seconds. * * 3. 
ios can't be reliably canceled and never really time out; an io is only * really dead when the machine is dead/reset or storage access is cut off. * The delta lease algorithm expects real io timeouts. * * So, the delay is really meant to represent the time until we are certain a * host is safely gone and will no longer write, and for sanlock that means * until the watchdog has reset it. */ #ifndef __TIMEOUTS_H__ #define __TIMEOUTS_H__ int calc_host_dead_seconds(int io_timeout); int calc_id_renewal_seconds(int io_timeout); int calc_id_renewal_fail_seconds(int io_timeout); int calc_id_renewal_warn_seconds(int io_timeout); int calc_set_bitmap_seconds(int io_timeout); void log_timeouts(int io_timeout_arg); #endif sanlock-3.8.2/src/watchdog.c000066400000000000000000000070331371427612200157210ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock_internal.h" #include "log.h" #include "watchdog.h" /* * Purpose of watchdog: to forcibly reset the host in the case where a * supervised pid is running but sanlock daemon does not renew its lease * and does not kill the pid (or it kills the pid but the pid does not * exit). So, just before the pid begins running with granted leases, * /dev/watchdog needs to be armed to reboot the host if things go bad right * after the pid goes ahead. 
*/ #include "../wdmd/wdmd.h" void update_watchdog(struct space *sp, uint64_t timestamp, int id_renewal_fail_seconds) { int rv; if (!com.use_watchdog) return; rv = wdmd_test_live(sp->wd_fd, timestamp, timestamp + id_renewal_fail_seconds); if (rv < 0) log_erros(sp, "wdmd_test_live %llu failed %d", (unsigned long long)timestamp, rv); } int connect_watchdog(struct space *sp) { int con; if (!com.use_watchdog) return 0; con = wdmd_connect(); if (con < 0) { log_erros(sp, "wdmd_connect failed %d", con); return -1; } return con; } int activate_watchdog(struct space *sp, uint64_t timestamp, int id_renewal_fail_seconds, int con) { char name[WDMD_NAME_SIZE]; int test_interval, fire_timeout; uint64_t last_keepalive; int rv; if (!com.use_watchdog) return 0; memset(name, 0, sizeof(name)); snprintf(name, WDMD_NAME_SIZE - 1, "sanlock_%s:%llu", sp->space_name, (unsigned long long)sp->host_id); rv = wdmd_register(con, name); if (rv < 0) { log_erros(sp, "wdmd_register failed %d", rv); goto fail_close; } /* the refcount tells wdmd that it should not cleanly exit */ rv = wdmd_refcount_set(con); if (rv < 0) { log_erros(sp, "wdmd_refcount_set failed %d", rv); goto fail_close; } rv = wdmd_status(con, &test_interval, &fire_timeout, &last_keepalive); if (rv < 0) { log_erros(sp, "wdmd_status failed %d", rv); goto fail_clear; } if (fire_timeout != WATCHDOG_FIRE_TIMEOUT) { log_erros(sp, "wdmd invalid fire_timeout %d vs %d", fire_timeout, WATCHDOG_FIRE_TIMEOUT); goto fail_clear; } rv = wdmd_test_live(con, timestamp, timestamp + id_renewal_fail_seconds); if (rv < 0) { log_erros(sp, "wdmd_test_live in create failed %d", rv); goto fail_clear; } sp->wd_fd = con; return 0; fail_clear: wdmd_refcount_clear(con); fail_close: close(con); return -1; } void deactivate_watchdog(struct space *sp) { int rv; if (!com.use_watchdog) return; log_space(sp, "wdmd_test_live 0 0 to disable"); rv = wdmd_test_live(sp->wd_fd, 0, 0); if (rv < 0) { log_erros(sp, "wdmd_test_live in deactivate failed %d", rv); /* We really 
/*
 * Copyright 2010-2011 Red Hat, Inc.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v2 or (at your option) any later version.
 */

#ifndef __WATCHDOG_H__
#define __WATCHDOG_H__

/* uint64_t is used in the prototypes below */
#include <stdint.h>

/*
 * Forward declaration so the prototypes all refer to the same file-scope
 * type even when this header is included before the definition.
 */
struct space;

/*
 * Tell wdmd the lockspace was alive at timestamp and expires at
 * timestamp + id_renewal_fail_seconds.  Failures are only logged.
 * No-op when the watchdog is disabled.
 */
void update_watchdog(struct space *sp, uint64_t timestamp,
		     int id_renewal_fail_seconds);

/*
 * Open a connection to wdmd.  Returns the connection fd, 0 when the
 * watchdog is disabled, or -1 on failure.
 */
int connect_watchdog(struct space *sp);

/*
 * Register the lockspace on connection con and start the first liveness
 * test.  Returns 0 on success (con is stored in sp) or when the watchdog
 * is disabled; returns -1 on failure after closing con.
 */
int activate_watchdog(struct space *sp, uint64_t timestamp,
		      int id_renewal_fail_seconds, int con);

/*
 * Disable the liveness test for the lockspace (retrying once after a short
 * delay if the first attempt fails) and clear the wdmd refcount.
 */
void deactivate_watchdog(struct space *sp);

/* Close the wdmd connection stored in sp.  No-op when disabled. */
void close_watchdog(struct space *sp);

#endif
# Remove build products.  Every binary built by "all" is listed by its
# TARGETn variable; there is no plain $(TARGET) variable in this Makefile,
# so $(TARGET1) (devcount) must be named explicitly to be removed.
clean:
	rm -f *.o *.so *.so.* $(TARGET1) $(TARGET2) $(TARGET3) $(TARGET4) $(TARGET5) $(TARGET6) $(TARGET7)
kill -s SIGSTOP $pid sleep 20 kill -s SIGCONT $pid sleep 2 echo save linear rm -f /tmp/client-state.txt rm -f /tmp/client-linear.txt rm -f /tmp/client-error.txt dmsetup table $dev > /tmp/client-linear.txt sed "s/linear/error/" /tmp/client-linear.txt > /tmp/client-error.txt echo load error dmsetup suspend $dev dmsetup load $dev /tmp/client-error.txt dmsetup resume $dev echo sleep $sec sleep $sec echo load linear dmsetup suspend $dev dmsetup load $dev /tmp/client-linear.txt dmsetup resume $dev elif [ "$cmd" == "error" ]; then echo save linear rm -f /tmp/client-state.txt rm -f /tmp/client-linear.txt rm -f /tmp/client-error.txt dmsetup table $dev > /tmp/client-linear.txt sed "s/linear/error/" /tmp/client-linear.txt > /tmp/client-error.txt echo load error dmsetup suspend $dev dmsetup load $dev /tmp/client-error.txt dmsetup resume $dev elif [ "$cmd" == "linear" ]; then echo load linear dmsetup suspend $dev dmsetup load $dev /tmp/client-linear.txt dmsetup resume $dev elif [ "$cmd" == "resume" ]; then hostid=$4 echo load linear dmsetup suspend $dev dmsetup load $dev /tmp/client-linear.txt dmsetup resume $dev echo sanlock client add_lockspace -s test:$hostid:$dev:0 sanlock client add_lockspace -s test:$hostid:$dev:0 while read pid state; do echo sanlock client acquire -p $pid -r $state sanlock client acquire -p $pid -r $state ret=$? 
if [ $ret == 0 ]; then kill -s SIGCONT $pid else kill -s SIGKILL $pid fi done < /tmp/client-state.txt else echo "" echo "clientn N init DEV" echo " sanlock direct init -s test:0:DEV:0" echo " sanlock direct init -r test:rI:DEV:OFF" echo "" echo "clientn N start DEV HOSTID KILLPATH" echo " sanlock client add_lockspace -s test:HOSTID:DEV:0" echo " starts N ./sanlk_client processes" echo "" echo "clientn N delay SEC" echo " sigstop sanlock daemon" echo " sleep SEC" echo " sigcont sanlock daemon" echo "" echo "clientn N iodelay DEV SEC" echo " block i/o to DEV" echo " sleep SEC" echo " unblock i/o to DEV" echo "" echo "clientn N linear DEV" echo " unblock i/o to DEV" echo "" echo "clientn N error DEV" echo " blocks i/o to DEV" echo " causes KILLPATH to run" echo " causes lockspace to be removed" echo "" echo "clientn N resume DEV HOSTID" echo " sanlock client add_lockspace -s test:HOSTID:DEV:0" echo " reacquires leases for sanlk_client pids paused by" echo " killpath_pause, based on inquire state saved by killpath" fi sanlock-3.8.2/tests/conftest.py000066400000000000000000000027571371427612200165370ustar00rootroot00000000000000# Copyright (C) 2019 Red Hat, Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. """ Fixtures for sanlock testing. """ from __future__ import absolute_import import os import pytest from . import storage from . import util class SanlockIsRunning(Exception): """ Raised if sanlock running when it should not """ @pytest.fixture def sanlock_daemon(): """ Run sanlock daemon during a test. """ p = util.start_daemon() try: util.wait_for_daemon(0.5) yield finally: # Killing sanlock allows terminating without reomving the lockspace, # which takes about 3 seconds, slowing down the tests. 
@pytest.fixture(params=[
    pytest.param(storage.BLOCK, id="block"),
    pytest.param(storage.FILE, id="file"),
])
def user_4k_path(request):
    """
    A path to block device or file on file system on top of 4k block device,
    provided by the user. If storage is not available, skip the tests.

    The fixture is parametrized, so a test using it runs once for
    storage.BLOCK and once for storage.FILE.
    """
    if not os.path.exists(request.param):
        # The path is missing, so the message must say the storage is NOT
        # available; the helper module is tests/storage.py.
        pytest.skip(
            "user storage not available - run 'python tests/storage.py setup' "
            "to enable 4k storage tests")
    return request.param
def test_single_instance(sanlock_daemon):
    # Starting another instance while the daemon is running must fail.
    # The sanlock_daemon fixture keeps the first instance alive for the
    # duration of this test.
    p = util.start_daemon()
    try:
        # The second instance is expected to exit on its own within 1 second.
        util.wait_for_termination(p, 1.0)
    except util.TimeoutExpired:
        # It did not exit in time; kill and reap it so no stray process is
        # left behind.
        p.kill()
        p.wait()
    assert p.returncode == 1
def test_create(tmpdir, sanlock_daemon):
    path = tmpdir.join("rindex")
    # Slots: lockspace rindex master-lease user-lease-1
    size = 4 * MiB
    util.create_file(str(path), size)

    # Note: using 1 second io timeout (-o 1) for quicker tests.
    lockspace = "ls_name:1:%s:0" % path
    util.sanlock("client", "init", "-s", lockspace, "-o", "1")

    rindex = "ls_name:%s:1M" % path
    util.sanlock("client", "format", "-x", rindex)
    util.sanlock("client", "add_lockspace", "-s", lockspace, "-o", "1")
    util.sanlock("client", "create", "-x", rindex, "-e", "res")

    with io.open(str(path), "rb") as f:
        # The new entry should be created in the first entry slot.
        # The first rindex sector is used by the rindex header, so the
        # entries start 512 bytes into the rindex area.
        f.seek(MiB + 512)
        util.check_rindex_entry(f.read(RINDEX_ENTRY_SIZE), b"res", 3 * MiB, 0)

        # The rest of the entries should not be modified.
        rest = 512 * RINDEX_ENTRIES_SECTORS - RINDEX_ENTRY_SIZE
        assert f.read(rest) == b"\0" * rest

        # The slot recorded in the entry above (offset 3 MiB, user-lease-1
        # in the slot layout) should contain the initialized lease for "res".
        f.seek(3 * MiB)
        magic, = struct.unpack("< I", f.read(4))
        assert magic == PAXOS_DISK_MAGIC

    util.check_guard(str(path), size)
def test_lookup(tmpdir, sanlock_daemon):
    path = tmpdir.join("rindex")
    # Slots: lockspace rindex master-lease user-lease-1 ... user-lease-7
    size = 10 * MiB
    util.create_file(str(path), size)

    # Note: using 1 second io timeout (-o 1) for quicker tests.
    lockspace = "ls_name:1:%s:0" % path
    util.sanlock("client", "init", "-s", lockspace, "-o", "1")

    rindex = "ls_name:%s:1M" % path
    util.sanlock("client", "format", "-x", rindex)
    util.sanlock("client", "add_lockspace", "-s", lockspace, "-o", "1")
    util.sanlock("client", "create", "-x", rindex, "-e", "res")

    lookup = util.sanlock("client", "lookup", "-x", rindex, "-e", "res")
    # 3145728 is 3 MiB: the fourth 1 MiB slot (user-lease-1 in the layout
    # above), where the only resource created so far is expected to be.
    assert lookup == b"lookup done 0\nname res offset 3145728\n"
#!/bin/bash
#
# Switch a device-mapper device between its saved "linear" table and an
# "error" table, to simulate i/o failures for the devcount tests.
#
# The first argument is the command (save, error or linear), the second is
# the device-mapper device.  "save" must be run first: it stores the current
# table in /tmp/table-linear.txt and an error variant of it in
# /tmp/table-error.txt.  The other commands load the matching saved table.

if [ $# -le 1 ]; then
	echo "num $#"
	echo ""
	echo "devcount-dmsetup save "
	echo ""
	echo "devcount-dmsetup error "
	echo ""
	echo "devcount-dmsetup linear "
	echo ""
	# Without both arguments there is nothing sensible to do; do not fall
	# through and run dmsetup with an empty device name.
	exit 1
fi

cmd=$1
dev=$2

if [ "$cmd" == "save" ]; then
	rm -f /tmp/table-linear.txt
	rm -f /tmp/table-error.txt
	dmsetup table "$dev" > /tmp/table-linear.txt
	sed "s/linear/error/" /tmp/table-linear.txt > /tmp/table-error.txt
	exit 0
fi

table="/tmp/table-$cmd.txt"

# Check for the table before suspending the device, so a mistyped command
# or a missing "save" step does not needlessly suspend i/o.
if [ ! -f "$table" ]; then
	echo "table file $table not found"
	exit 1
fi

dmsetup suspend "$dev"
dmsetup load "$dev" "$table"
dmsetup resume "$dev"
/*
 * Forcibly terminate child "pid" with SIGKILL and reap it.
 *
 * Returns 0 once the child has been reaped after terminating (by the
 * signal or by a normal exit that raced with it), -1 if waitpid() fails
 * (e.g. pid is not our child), -2 if waitpid() reports a different pid.
 */
static int kill_pid(int pid)
{
	int rv, status;

	kill(pid, SIGKILL);

	while (1) {
		rv = waitpid(pid, &status, 0);
		if (rv < 0) {
			/* interrupted by a signal handler: wait again */
			if (errno == EINTR)
				continue;
			return -1;
		}
		if (rv != pid)
			return -2;

		/*
		 * A child terminated by SIGKILL reports WIFSIGNALED, not
		 * WIFEXITED, so both count as "terminated".
		 */
		if (WIFEXITED(status) || WIFSIGNALED(status))
			return 0;
	}
}
= (struct entry *)buf; int i; for (i = 0; i < (512 / sizeof(struct entry)); i++) { log_error("%s c %d index %d turn %u time %llu %u:%llu:%llu " "last %u %llu %u:%llu:%llu", path, pid, i, e->turn, (unsigned long long)e->time, e->hostid, (unsigned long long)e->pid, (unsigned long long)e->count, e->last_turn, (unsigned long long)e->last_time, e->last_hostid, (unsigned long long)e->last_pid, (unsigned long long)e->last_count); e++; } } void print_our_we(char *path, int pid, int writes, struct entry *our_we, const char *stage) { log_debug("%s c %d %s w %d index %d turn %u time %llu %u:%llu:%llu " "last %u %llu %u:%llu:%llu", path, pid, stage, writes, our_hostid - 1, our_we->turn, (unsigned long long)our_we->time, our_we->hostid, (unsigned long long)our_we->pid, (unsigned long long)our_we->count, our_we->last_turn, (unsigned long long)our_we->last_time, our_we->last_hostid, (unsigned long long)our_we->last_pid, (unsigned long long)our_we->last_count); } #define COUNT_ARGS 6 #define LOCK_ARGS 8 #define MIGRATE_ARGS 9 /* * devcount rw|wr */ static int do_count(int argc, char *argv[]) { char *rbuf, **p_rbuf, *wbuf, **p_wbuf, *vbuf, **p_vbuf; struct entry *re, *max_re, *our_we; int i, fd, rv, error, max_i; int pause_fd; time_t start; uint32_t our_pid = getpid(); uint32_t max_turn; int sec1, sec2; int read_seconds, write_seconds; uint32_t writes = 0; if (argc < COUNT_ARGS) return -1; pause_fd = setup_pause(); strcpy(count_path, argv[2]); sec1 = atoi(argv[3]); sec2 = atoi(argv[4]); our_hostid = atoi(argv[5]); if (!strcmp(argv[1], "rw")) { read_seconds = sec1; write_seconds = sec2; } else { write_seconds = sec1; read_seconds = sec2; } /* printf("%d %s count_disk %s sec1 %d sec2 %d our_hostid %d\n", our_pid, argv[1], count_path, sec1, sec2, our_hostid); */ fd = open(count_path, O_RDWR | O_DIRECT | O_SYNC, 0); if (fd < 0) { perror("open failed"); error = 1; goto fail; } rv = ioctl(fd, BLKFLSBUF); if (rv) { perror("BLKFLSBUF failed"); error = 2; goto fail; } p_rbuf = &rbuf; p_wbuf 
= &wbuf; p_vbuf = &vbuf; rv = posix_memalign((void *)p_rbuf, getpagesize(), 512); if (rv) { perror("posix_memalign failed"); error = 3; goto fail; } rv = posix_memalign((void *)p_wbuf, getpagesize(), 512); if (rv) { perror("posix_memalign failed"); error = 4; goto fail; } rv = posix_memalign((void *)p_vbuf, getpagesize(), 512); if (rv) { perror("posix_memalign failed"); error = 5; goto fail; } lseek(fd, count_offset, SEEK_SET); rv = read(fd, rbuf, 512); if (rv != 512) { perror("read failed"); error = 6; goto fail; } /* print_entries(our_pid, rbuf); */ /* * reading for "rw" */ if (!strcmp(argv[1], "rw")) { for (i = 0; i < read_seconds; i++) { sleep(1); lseek(fd, count_offset, SEEK_SET); rv = read(fd, vbuf, 512); if (rv != 512) { perror("read failed"); error = 7; goto fail; } if (memcmp(rbuf, vbuf, 512)) { log_error("%s c %d rbuf:", count_path, our_pid); print_entries(count_path, our_pid, rbuf); log_error("%s c %d vbuf:", count_path, our_pid); print_entries(count_path, our_pid, vbuf); error = 8; goto fail; } } } /* * writing */ re = (struct entry *)rbuf; max_re = NULL; max_i = 0; max_turn = 0; for (i = 0; i < (512 / sizeof(struct entry)); i++) { if (!max_re || re->count > max_re->count) { max_re = re; max_i = i; } if (!max_turn || re->turn > max_turn) max_turn = re->turn; re++; } if (max_turn != max_re->turn) { log_error("%s c %d max_turn %d max_re->turn %d\n", count_path, our_pid, max_turn, max_re->turn); error = 9; goto fail; } /* printf("%d max index %d turn %d count %llu\n", our_pid, max_i, max_turn, (unsigned long long)max_re->count); */ memcpy(wbuf, rbuf, 512); our_we = (struct entry *)(wbuf + ((our_hostid - 1) * sizeof(struct entry))); our_we->last_turn = max_re->turn; our_we->last_hostid = max_re->hostid; our_we->last_pid = max_re->pid; our_we->last_time = max_re->time; our_we->last_count = max_re->count; our_we->turn = max_re->turn + 1; our_we->hostid = our_hostid; our_we->pid = our_pid; our_we->time = time(NULL); our_we->count = max_re->count + 1; lseek(fd, 
count_offset, SEEK_SET); rv = write(fd, wbuf, 512); if (rv != 512) { perror("write failed"); error = 10; goto fail; } writes = 1; print_our_we(count_path, our_pid, writes, our_we, "begin"); start = time(NULL); while (1) { our_we->count++; our_we->time = time(NULL); lseek(fd, count_offset, SEEK_SET); rv = write(fd, wbuf, 512); if (rv != 512) { perror("write failed"); error = 11; goto fail; } writes++; if (write_seconds && (our_we->time - start >= write_seconds)) break; if (!(writes % 64) && check_pause(pause_fd)) { print_our_we(count_path, our_pid, writes, our_we, "pause"); fprintf(stderr, "we_are_paused\n"); raise(SIGSTOP); /* this shouldn't appear until parent does kill(SIGCONT) */ print_our_we(count_path, our_pid, writes, our_we, "resume"); } } print_our_we(count_path, our_pid, writes, our_we, "end"); if (turn_file) { fprintf(turn_file, "turn %03u start %llu end %llu host %u pid %u\n", our_we->turn, (unsigned long long)(max_re->count + 1), (unsigned long long)our_we->count, our_hostid, our_pid); fflush(turn_file); fclose(turn_file); } /* * reading for "wr" */ if (!strcmp(argv[1], "wr")) { memcpy(rbuf, wbuf, 512); for (i = 0; i < read_seconds; i++) { sleep(1); lseek(fd, count_offset, SEEK_SET); rv = read(fd, vbuf, 512); if (rv != 512) { perror("read failed"); error = 12; goto fail; } if (memcmp(rbuf, vbuf, 512)) { log_error("%s c %d rbuf:", count_path, our_pid); print_entries(count_path, our_pid, rbuf); log_error("%s c %d vbuf:", count_path, our_pid); print_entries(count_path, our_pid, vbuf); error = 13; goto fail; } } } return 0; fail: fprintf(stderr, "error %d\n", error); while (1) { log_error("%s c %d error %d", count_path, our_pid, error); print_entries(count_path, our_pid, rbuf); print_entries(count_path, our_pid, vbuf); sleep(2); } } static int add_lockspace(void) { int rv; strcpy(lockspace.name, "devcount"); strcpy(lockspace.host_id_disk.path, lock_path); lockspace.host_id_disk.offset = lock_offset; lockspace.host_id = our_hostid; rv = 
sanlock_add_lockspace(&lockspace, 0); log_debug("%s p %d sanlock_add_lockspace %d", lock_path, getpid(), rv); return rv; } /* * Test inquire and acquire with version * * lock: * acquire (no lver) * if fail * goto lock; * else * goto run; * * relock: * acquire with saved lver * if fail (others may acquire in lock:) * sigkill pid; * goto lock; * else * sigcont pid; * goto run; * * run: * run rw for a while * inquire pid * save lver * sigstop pid * release ALL * goto relock * */ static int do_relock(int argc, char *argv[]) { char *av[COUNT_ARGS+1]; struct sanlk_resource *res, *res_inq; int i, j, pid, rv, sock, len, status; int c2p[2]; /* child to parent */ int res_count; uint32_t parent_pid = getpid(); uint64_t lver; char *state; if (argc < LOCK_ARGS) return -1; count_offset = 0; strcpy(lock_path, argv[2]); strcpy(count_path, argv[4]); our_hostid = atoi(argv[7]); add_lockspace(); len = sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk); res = malloc(len); memset(res, 0, len); strcpy(res->lockspace_name, lockspace.name); snprintf(res->name, SANLK_NAME_LEN, "resource%s", count_path); res->name[SANLK_NAME_LEN-1] = '\0'; res->num_disks = 1; strncpy(res->disks[0].path, lock_path, SANLK_PATH_LEN); res->disks[0].path[SANLK_PATH_LEN-1] = '\0'; res->disks[0].offset = LEASE_SIZE; /* * argv[0] = devcount * argv[1] = relock * argv[2] = * argv[3] = rw * start copying at argv[3] */ j = 0; av[j++] = strdup(argv[0]); for (i = 3; i < LOCK_ARGS; i++) av[j++] = strdup(argv[i]); av[j] = NULL; while (1) { pipe(c2p); pid = fork(); if (!pid) { int child_pid = getpid(); sock = sanlock_register(); if (sock < 0) { log_error("%s c %d sanlock_register error %d", count_path, child_pid, sock); exit(-1); } res->flags = 0; res->lver = 0; rv = sanlock_acquire(sock, -1, 0, 1, &res, NULL); if (rv < 0) { log_debug("%s c %d sanlock_acquire error %d", count_path, child_pid, rv); /* all hosts are trying to acquire so we expect this to acquire only sometimes; TODO: exit with an error for some rv's */ 
exit(0); } log_debug("%s c %d sanlock_acquire done", count_path, child_pid); rv = sanlock_restrict(sock, SANLK_RESTRICT_ALL); if (rv < 0) { log_error("%s c %d sanlock_restrict error %d", count_path, child_pid, sock); exit(-1); } /* make child's stderr go to parent c2p[0] */ close(2); dup(c2p[1]); close(c2p[0]); close(c2p[1]); execv(av[0], av); perror("execv devcount problem"); exit(EXIT_FAILURE); } run_more: /* let the child run for 30 seconds before stopping it */ for (i = 0; i < 30; i++) { rv = waitpid(pid, &status, WNOHANG); if (rv == pid) break; sleep(1); } /* we expect child to exit when it fails to acquire the lock because it's held by someone else, or rw run time is up */ if (rv == pid) goto dead_child; rv = sanlock_inquire(-1, pid, 0, &res_count, &state); if (rv == -EBUSY) { /* pid probably still busy doing acquire */ goto run_more; } if (rv == -ESTALE || rv == -ESRCH) { /* pid has exited */ goto run_more; } if (rv < 0) { log_error("%s p %d sanlock_inquire c %d error %d", count_path, parent_pid, pid, rv); goto run_more; } rv = sanlock_str_to_res(state, &res_inq); if (rv < 0) { log_error("%s p %d sanlock_str_to_res error %d %s", count_path, parent_pid, rv, state); goto fail; } lver = res_inq->lver; log_debug("%s p %d sanlock_inquire c %d lver %llu done", count_path, parent_pid, pid, (unsigned long long)lver); free(res_inq); free(state); pause_pid(pid, c2p[0]); log_debug("%s p %d paused c %d", count_path, parent_pid, pid); rv = sanlock_release(-1, pid, SANLK_REL_ALL, 0, NULL); if (rv < 0) { /* pid may have exited */ log_error("%s p %d sanlock_release c %d error %d", count_path, parent_pid, pid, rv); goto kill_child; } log_debug("%s p %d sanlock_release c %d done", count_path, parent_pid, pid); /* give a chance to someone else to acquire the lock in here */ usleep(1000000); res->flags = SANLK_RES_LVER; res->lver = lver; rv = sanlock_acquire(-1, pid, 0, 1, &res, NULL); if (!rv) { /* we got the lock back in the same version */ log_debug("%s p %d sanlock_acquire 
c %d lver %llu done", count_path, parent_pid, pid, (unsigned long long)lver); resume_pid(pid); goto run_more; } /* someone got the lock between our release and reacquire */ log_debug("%s p %d sanlock_acquire c %d lver %llu error %d", count_path, parent_pid, pid, (unsigned long long)lver, rv); kill_child: kill_pid(pid); log_debug("%s p %d killed c %d", count_path, parent_pid, pid); dead_child: close(c2p[0]); close(c2p[1]); sleep(rand_int(0, 1)); } fail: printf("test failed...\n"); sleep(1000000); return -1; } /* * devcount lock rw * sanlock add_lockspace -s devcount:::0 * devcount rw */ static int do_lock(int argc, char *argv[]) { char *av[COUNT_ARGS+1]; struct sanlk_resource *res; int i, j, pid, rv, sock, len, status; if (argc < LOCK_ARGS) return -1; count_offset = 0; strcpy(lock_path, argv[2]); strcpy(count_path, argv[4]); our_hostid = atoi(argv[7]); add_lockspace(); len = sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk); res = malloc(len); memset(res, 0, len); strcpy(res->lockspace_name, lockspace.name); snprintf(res->name, SANLK_NAME_LEN, "resource%s", count_path); res->name[SANLK_NAME_LEN-1] = '\0'; res->num_disks = 1; strncpy(res->disks[0].path, lock_path, SANLK_PATH_LEN); res->disks[0].path[SANLK_PATH_LEN-1] = '\0'; res->disks[0].offset = LEASE_SIZE; /* * argv[0] = devcount * argv[1] = lock * argv[2] = * argv[3] = rw * start copying at argv[3] */ j = 0; av[j++] = strdup(argv[0]); for (i = 3; i < LOCK_ARGS; i++) av[j++] = strdup(argv[i]); av[j] = NULL; while (1) { pid = fork(); if (!pid) { int child_pid = getpid(); sock = sanlock_register(); if (sock < 0) { log_error("%s c %d sanlock_register error %d", count_path, child_pid, sock); exit(-1); } rv = sanlock_acquire(sock, -1, 0, 1, &res, NULL); if (rv < 0) { log_debug("%s c %d sanlock_acquire error %d", count_path, child_pid, rv); /* all hosts are trying to acquire so we expect this to acquire only sometimes; TODO: exit with an error for some rv's */ exit(0); } log_debug("%s c %d sanlock_acquire done", 
count_path, child_pid); execv(av[0], av); perror("execv devcount problem"); exit(EXIT_FAILURE); } waitpid(pid, &status, 0); /* TODO: goto fail if exit status is an error */ sleep(rand_int(0, 1)); } printf("test failed...\n"); sleep(1000000); return -1; } static int do_wrap(int argc, char *argv[]) { char *av[COUNT_ARGS+1]; struct sanlk_resource *res; int i, j, rv, sock, len; uint32_t pid = getpid(); if (argc < LOCK_ARGS) return -1; count_offset = 0; strcpy(lock_path, argv[2]); strcpy(count_path, argv[4]); our_hostid = atoi(argv[7]); add_lockspace(); len = sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk); res = malloc(len); memset(res, 0, len); strcpy(res->lockspace_name, lockspace.name); snprintf(res->name, SANLK_NAME_LEN, "resource%s", count_path); res->name[SANLK_NAME_LEN-1] = '\0'; res->num_disks = 1; strncpy(res->disks[0].path, lock_path, SANLK_PATH_LEN); res->disks[0].path[SANLK_PATH_LEN-1] = '\0'; res->disks[0].offset = LEASE_SIZE; /* * argv[0] = devcount * argv[1] = wrap * argv[2] = * argv[3] = rw * start copying at argv[3] */ j = 0; av[j++] = strdup(argv[0]); for (i = 3; i < LOCK_ARGS; i++) av[j++] = strdup(argv[i]); av[j] = NULL; sock = sanlock_register(); if (sock < 0) { log_error("%s c %d sanlock_register error %d", count_path, pid, sock); exit(-1); } rv = sanlock_restrict(sock, SANLK_RESTRICT_SIGKILL); if (rv < 0) { log_error("%s c %d sanlock_restrict error %d", count_path, pid, sock); exit(-1); } rv = sanlock_acquire(sock, -1, 0, 1, &res, NULL); if (rv < 0) { log_error("%s c %d sanlock_acquire error %d", count_path, pid, rv); /* all hosts are trying to acquire so we expect this to acquire only sometimes; TODO: exit with an error for some rv's */ exit(0); } log_debug("%s c %d sanlock_acquire done", count_path, pid); execv(av[0], av); perror("execv devcount problem"); exit(EXIT_FAILURE); } /* * Test migration sequence (source inquires/releases, dest acquires lver) * * dest forks (e.g. 
libvirtd creates qemu pid) * dest child does sanlock_register, waits for parent (e.g. qemu incoming paused) * source parent does sanlock_inquire * source parent sigstop child, sanlock_release, writes state to disk * dest parent reads state from disk, sanlock_acquire(child_pid, state.lver) * dest parent tells child to run (e.g. qemu incoming resumed) * dest child execs rw * source parent sigkill child */ static void write_migrate_incoming(char *state_in) { char target_str[32]; char state[1024]; char *wbuf, **p_wbuf; int fd, rv; int offset = 4096; int target; target = (our_hostid % max_hostid) + 1; memset(state, 0, sizeof(state)); memset(target_str, 0, sizeof(target_str)); sprintf(target_str, " target=%d", target); strcat(state, state_in); strcat(state, target_str); if (strlen(state) > 512) { printf("state string too long\n"); goto fail; } fd = open(count_path, O_RDWR | O_DIRECT | O_SYNC, 0); if (fd < 0) { perror("open failed"); goto fail; } rv = ioctl(fd, BLKFLSBUF); if (rv) { perror("BLKFLSBUF failed"); goto fail; } p_wbuf = &wbuf; rv = posix_memalign((void *)p_wbuf, getpagesize(), 512); if (rv) { perror("posix_memalign failed"); goto fail; } memset(wbuf, 0, 512); memcpy(wbuf, state, strlen(state)); lseek(fd, offset, SEEK_SET); rv = write(fd, wbuf, 512); if (rv != 512) { perror("write failed"); goto fail; } /* printf("write_migrate_incoming \"%s\"\n", wbuf); */ close(fd); return; fail: printf("write_migrate %d failed %s\n", offset, state); sleep(10000000); } /* read incoming block until it's set and our_hostid is next */ static int wait_migrate_incoming(uint64_t *lver) { struct sanlk_resource *res; char *rbuf, **p_rbuf, *wbuf, **p_wbuf; char *target_str, *val_str; int fd, rv, val; int offset = 4096; fd = open(count_path, O_RDWR | O_DIRECT | O_SYNC, 0); if (fd < 0) { perror("open failed"); goto fail; } rv = ioctl(fd, BLKFLSBUF); if (rv) { perror("BLKFLSBUF failed"); goto fail; } p_rbuf = &rbuf; p_wbuf = &wbuf; rv = posix_memalign((void *)p_rbuf, getpagesize(), 512); 
if (rv) { perror("posix_memalign failed"); goto fail; } rv = posix_memalign((void *)p_wbuf, getpagesize(), 512); if (rv) { perror("posix_memalign failed"); goto fail; } retry: lseek(fd, offset, SEEK_SET); rv = read(fd, rbuf, 512); if (rv != 512) { perror("read failed"); goto fail; } rbuf[511] = '\0'; /* init case to get things going */ if (!rbuf[0] && our_hostid == 1) { *lver = 0; return 1; } target_str = strstr(rbuf, " target="); if (!target_str) { goto retry; } val_str = strstr(target_str, "=") + 1; if (!val_str) { goto retry; } val = atoi(val_str); if (val != our_hostid) { goto retry; } /* printf("wait_migrate_incoming \"%s\"\n", rbuf); */ *target_str = '\0'; rv = sanlock_str_to_res(rbuf, &res); if (rv < 0) { printf("str_to_res error %d\n", rv); goto fail; } *lver = res->lver; free(res); /* strcpy(state_out, rbuf); */ memset(wbuf, 0, 512); sprintf(wbuf, "%s", "empty"); lseek(fd, offset, SEEK_SET); rv = write(fd, wbuf, 512); if (rv != 512) { perror("write failed"); goto fail; } close(fd); return 0; fail: printf("wait_migrate_incoming failed\n"); sleep(10000000); return -1; } #define MAX_MIGRATE_STATE 512 /* keep in one block for simplicity */ static int do_migrate(int argc, char *argv[]) { char *av[MIGRATE_ARGS+1]; struct sanlk_resource *res; int i, j, pid, rv, sock, len, init; int p2c[2]; /* parent to child */ int c2p[2]; /* child to parent */ int res_count; uint32_t parent_pid = getpid(); uint64_t lver; char *state; if (argc < MIGRATE_ARGS) return -1; count_offset = 0; strcpy(lock_path, argv[2]); strcpy(count_path, argv[4]); our_hostid = atoi(argv[7]); max_hostid = atoi(argv[8]); add_lockspace(); len = sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk); res = malloc(len); memset(res, 0, len); strcpy(res->lockspace_name, lockspace.name); snprintf(res->name, SANLK_NAME_LEN, "resource%s", count_path); res->name[SANLK_NAME_LEN-1] = '\0'; res->num_disks = 1; strncpy(res->disks[0].path, lock_path, SANLK_PATH_LEN); res->disks[0].path[SANLK_PATH_LEN-1] = '\0'; 
res->disks[0].offset = LEASE_SIZE; /* * argv[0] = devcount * argv[1] = migrate * argv[2] = * argv[3] = rw * start copying at argv[3] */ j = 0; av[j++] = strdup(argv[0]); for (i = 3; i < MIGRATE_ARGS; i++) av[j++] = strdup(argv[i]); av[j] = NULL; while (1) { pipe(p2c); pipe(c2p); pid = fork(); if (!pid) { int child_pid = getpid(); char junk; sock = sanlock_register(); if (sock < 0) { log_error("%s c %d sanlock_register error %d", count_path, child_pid, sock); exit(-1); } log_debug("%s c %d wait", count_path, child_pid); read(p2c[0], &junk, 1); close(p2c[0]); close(p2c[1]); log_debug("%s c %d begin", count_path, child_pid); /* make child's stderr go to parent c2p[0] */ close(2); dup(c2p[1]); close(c2p[0]); close(c2p[1]); execv(av[0], av); perror("execv devcount problem"); exit(EXIT_FAILURE); } init = wait_migrate_incoming(&lver); /* from source */ if (init) { res->flags = 0; res->lver = 0; } else { res->flags = SANLK_RES_LVER; res->lver = lver; } rv = sanlock_acquire(-1, pid, 0, 1, &res, NULL); if (rv < 0) { log_error("%s p %d sanlock_acquire c %d error %d", count_path, parent_pid, pid, rv); exit(0); } log_debug("%s p %d sanlock_acquire c %d init %d lver %llu done", count_path, parent_pid, pid, init, (unsigned long long)lver); /* tell child to resume */ write(p2c[1], "\n", 1); close(p2c[0]); close(p2c[1]); /* let the child run for 10 seconds before stopping it; if the child exits before the 10 seconds, the sanlock_inquire call should return an error */ sleep(10); rv = sanlock_inquire(-1, pid, 0, &res_count, &state); if (rv < 0) { log_error("%s p %d sanlock_inquire c %d error %d", count_path, parent_pid, pid, rv); goto fail; } log_debug("%s p %d sanlock_inquire c %d done", count_path, parent_pid, pid); pause_pid(pid, c2p[0]); log_debug("%s p %d paused c %d", count_path, parent_pid, pid); rv = sanlock_release(-1, pid, SANLK_REL_ALL, 0, NULL); if (rv < 0) { log_error("%s p %d sanlock_release c %d error %d", count_path, parent_pid, pid, rv); goto fail; } log_debug("%s p 
%d sanlock_release c %d done", count_path, parent_pid, pid); write_migrate_incoming(state); /* to dest */ kill_pid(pid); log_debug("%s p %d killed c %d", count_path, parent_pid, pid); close(c2p[0]); close(c2p[1]); free(state); } fail: printf("test failed...\n"); sleep(10000000); return -1; } /* * dmsetup table /dev/bull/lock1 > /tmp/table-linear.txt * sed "s/linear/error/" /tmp/table-linear.txt > /tmp/table-error.txt * * dmsetup suspend /dev/bull/lock1 * dmsetup load /dev/bull/lock1 /tmp/table-error.txt * dmsetup resume /dev/bull/lock1 * * dmsetup suspend /dev/bull/lock1 * dmsetup load /dev/bull/lock1 /tmp/table-linear.txt * dmsetup resume /dev/bull/lock1 */ static void dmsetup_save_lock_disk(void) { char cmd[128]; sprintf(cmd, "./devcount-dmsetup save %s", lock_path); system(cmd); } static void dmsetup_error_lock_disk(void) { char cmd[128]; sprintf(cmd, "./devcount-dmsetup error %s", lock_path); system(cmd); } static void dmsetup_linear_lock_disk(void) { char cmd[128]; sprintf(cmd, "./devcount-dmsetup linear %s", lock_path); system(cmd); } int do_expire(int argc, char *argv[]) { char *av[COUNT_ARGS+1]; struct sanlk_resource *res; uint32_t parent_pid = getpid(); int i, j, pid, rv, sock, len, status; int c2p[2]; char result[5]; if (argc < LOCK_ARGS) return -1; count_offset = 0; strcpy(lock_path, argv[2]); strcpy(count_path, argv[4]); our_hostid = atoi(argv[7]); dmsetup_save_lock_disk(); add_lockspace(); len = sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk); res = malloc(len); memset(res, 0, len); strcpy(res->lockspace_name, lockspace.name); snprintf(res->name, SANLK_NAME_LEN, "resource%s", count_path); res->name[SANLK_NAME_LEN-1] = '\0'; res->num_disks = 1; strncpy(res->disks[0].path, lock_path, SANLK_PATH_LEN); res->disks[0].path[SANLK_PATH_LEN-1] = '\0'; res->disks[0].offset = LEASE_SIZE; /* * argv[0] = devcount * argv[1] = expire * argv[2] = * argv[3] = rw * start copying at argv[3] */ j = 0; av[j++] = strdup(argv[0]); for (i = 3; i < LOCK_ARGS; i++) 
av[j++] = strdup(argv[i]); av[j] = NULL; while (1) { pipe(c2p); pid = fork(); if (!pid) { int child_pid = getpid(); sock = sanlock_register(); if (sock < 0) { log_error("%s c %d sanlock_register error %d", count_path, child_pid, sock); exit(-1); } /* this acquire can take up to 90 seconds waiting for the host_id of the owner to time out */ log_debug("%s c %d sanlock_acquire begin", count_path, child_pid); rv = sanlock_acquire(sock, -1, 0, 1, &res, NULL); if (rv < 0) { log_debug("%s c %d sanlock_acquire error %d", count_path, child_pid, rv); /* all hosts are trying to acquire so we expect this to acquire only sometimes; TODO: exit with an error for some rv's */ write(c2p[1], "fail", 4); close(c2p[0]); close(c2p[1]); exit(0); } log_debug("%s c %d sanlock_acquire done", count_path, child_pid); write(c2p[1], "good", 4); close(c2p[0]); close(c2p[1]); execv(av[0], av); perror("execv devcount problem"); exit(EXIT_FAILURE); } memset(&result, 0, sizeof(result)); read(c2p[0], &result, 4); close(c2p[0]); close(c2p[1]); if (strstr(result, "fail")) { /* we expect child to exit when it fails to acquire the lock because it's held by someone else */ waitpid(pid, &status, 0); goto dead_child; } /* this test should be run with sec2 set to some large value that won't run out before sanlock daemon kills rw */ sleep(rand_int(6, 100)); dmsetup_error_lock_disk(); log_debug("%s p %d disable %s", count_path, parent_pid, lock_path); /* sanlock daemon kills pid when the renewals fail; after the kill it will try to release the resource lease, which will also fail if the resource lease is on the same disk as the host_id lease. Other nodes trying to get pid's resource lease are watching our host_id for 90 seconds, after which they will take pid's resource lease. 
If the resource lease is on a different disk, the daemon will be able to release it after the kill, and another node will be able to take it immediately after that, without watching our host_id for 90 seconds */ /* other nodes can't rely on the daemon being able to kill rw, so they need to wait 90 seconds to ensure that the watchdog has killed the host before taking pid's resource lease. In a different test, have the daemon kill fail, causing rw to continue running until the watchdog fires, after which another host will take pid's resource lease */ waitpid(pid, &status, 0); log_debug("%s p %d waitpid c %d done", count_path, parent_pid, pid); sleep(rand_int(0, 3)); dmsetup_linear_lock_disk(); log_debug("%s p %d enable %s", count_path, parent_pid, lock_path); log_debug("%s p %d sanlock_add_lockspace begin", lock_path, parent_pid); while (1) { sleep(1); rv = add_lockspace(); if (!rv) break; } dead_child: sleep(rand_int(0, 1)); } printf("test failed...\n"); sleep(1000000); return -1; } /* * devcount init * sanlock direct init -n 8 -s devcount:0::0 * sanlock direct init -n 8 -r devcount:resource::LEASE_SIZE * dd if=/dev/zero of= bs=512 count=24 */ #define INIT_NUM_HOSTS 0 int do_init(int argc, char *argv[]) { char resbuf[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; struct sanlk_disk disk; struct sanlk_resource *res; struct sanlk_lockspace ls; char command[4096]; int rv, align_size; if (argc < 4) return -1; strcpy(count_path, argv[3]); #if 0 /* initialize host_id lease area at offset 0 */ memset(command, 0, sizeof(command)); snprintf(command, sizeof(command), "sanlock direct init -s devcount:0:%s:0", argv[2]); printf("%s\n", command); system(command); /* initialize first resource lease area at offset LEASE_SIZE */ memset(command, 0, sizeof(command)); snprintf(command, sizeof(command), "sanlock direct init -r devcount:resource%s:%s:%d", argv[3], argv[2], LEASE_SIZE); printf("%s\n", command); system(command); #else memset(&disk, 0, sizeof(disk)); 
strcpy(disk.path, argv[2]); align_size = sanlock_direct_align(&disk); if (align_size != LEASE_SIZE) { printf("sanlock_direct align %s error %d\n", disk.path, align_size); return -1; } memset(&ls, 0, sizeof(ls)); strcpy(ls.name, "devcount"); strcpy(ls.host_id_disk.path, argv[2]); printf("init sync\n"); rv = sanlock_direct_init(&ls, NULL, 0, INIT_NUM_HOSTS, 0); if (rv < 0) { printf("sanlock_direct_init lockspace error %d\n", rv); return -1; } printf("init async\n"); rv = sanlock_direct_init(&ls, NULL, 0, INIT_NUM_HOSTS, 1); if (rv < 0) { printf("sanlock_direct_init lockspace error %d\n", rv); return -1; } memset(resbuf, 0, sizeof(resbuf)); res = (struct sanlk_resource *)&resbuf; strcpy(res->lockspace_name, "devcount"); sprintf(res->name, "resource%s", argv[3]); res->num_disks = 1; strcpy(res->disks[0].path, argv[2]); res->disks[0].offset = LEASE_SIZE; rv = sanlock_direct_init(NULL, res, 0, INIT_NUM_HOSTS, 0); if (rv < 0) { printf("sanlock_direct_init resource error %d\n", rv); return -1; } #endif memset(command, 0, sizeof(command)); snprintf(command, sizeof(command), "dd if=/dev/zero of=%s bs=512 count=24", count_path); printf("%s\n", command); system(command); return 0; } int main(int argc, char *argv[]) { int rv; if (argc < 2) goto out; if (!strcmp(argv[1], "init")) rv = do_init(argc, argv); else if (!strcmp(argv[1], "rw") || !strcmp(argv[1], "wr")) rv = do_count(argc, argv); else if (!strcmp(argv[1], "rwsig")) { setup_sigterm(); argv[1] = "rw"; rv = do_count(argc, argv); } else if (!strcmp(argv[1], "lock")) rv = do_lock(argc, argv); else if (!strcmp(argv[1], "wrap")) rv = do_wrap(argc, argv); else if (!strcmp(argv[1], "relock")) rv = do_relock(argc, argv); else if (!strcmp(argv[1], "migrate")) rv = do_migrate(argc, argv); else if (!strcmp(argv[1], "expire")) rv = do_expire(argc, argv); if (!rv) return 0; out: /* * sanlock direct init -s devcount:0:/dev/bull/leases:0 * sanlock direct init -r devcount:resource/dev/bull/count:/dev/bull/leases:LEASE_SIZE * * host_id 
leases exists at offset 0 * first resource lease exists at offset LEASE_SIZE */ printf("devcount init \n"); printf(" sanlock direct init -s devcount:0::0\n"); printf(" sanlock direct init -r devcount:resource::LEASE_SIZE\n"); printf(" dd if=/dev/zero of= bs=512 count=24\n"); printf("\n"); printf("devcount rw \n"); printf(" rw: read count for sec1, looking for writes, then write for sec2\n"); printf(" wr: write count for sec1, then read for sec2, looking for writes\n"); printf("\n"); printf("devcount lock rw \n"); printf(" sanlock add_lockspace -s devcount:::0\n"); printf(" loop around fork, sanlock_acquire, exec devcount rw\n"); printf("\n"); printf("devcount relock rw \n"); printf(" sanlock add_lockspace -s devcount:::0\n"); printf(" loop around fork, sanlock_acquire, exec devcount rw\n"); printf(" sigstop child, inquire, release, re-acquire, sigcont|sigkill\n"); printf("\n"); printf("devcount wrap rw \n"); printf(" sanlock add_lockspace -s devcount:::0\n"); printf(" sanlock_acquire, exec devcount rw\n"); printf("\n"); printf("devcount migrate rw \n"); printf(" sanlock add_lockspace -s devcount:::0\n"); printf(" loop around fork, sanlock_acquire, exec devcount rw\n"); printf("\n"); printf("devcount expire rw \n"); printf("\n"); return -1; } sanlock-3.8.2/tests/devcountn000077500000000000000000000043721371427612200162660ustar00rootroot00000000000000#!/bin/bash if [ $# -le 3 ]; then echo "" echo "Start N devcount commands" echo "" echo "devcountn N init LOCKDEV_BASE COUNTDEV_BASE" echo "devcountn N rw COUNTDEV_BASE SEC1 SEC2 HOSTID" echo "devcountn N lock LOCKDEV_BASE rw COUNTDEV_BASE SEC1 SEC2 HOSTID" echo "devcountn N relock LOCKDEV_BASE rw COUNTDEV_BASE SEC1 SEC2 HOSTID" echo "devcountn N wrap LOCKDEV_BASE rw COUNTDEV_BASE SEC1 SEC2 HOSTID" echo "devcountn N migrate LOCKDEV_BASE rw COUNTDEV_BASE SEC1 SEC2 HOSTID MAXID" echo "devcountn N expire LOCKDEV_BASE rw COUNTDEV_BASE SEC1 SEC2 HOSTID" echo "" echo "devcount LOCKDEV1 rw COUNTDEV1 ..." 
echo "devcount LOCKDEV2 rw COUNTDEV2 ..." echo "devcount LOCKDEV3 rw COUNTDEV3 ..." echo ... echo "devcount LOCKDEVN rw COUNTDEVN ..." echo "" echo "Examples" echo "" echo "devcountn 3 init /dev/lock /dev/count" echo " devcount init /dev/lock1 /dev/count1" echo " devcount init /dev/lock2 /dev/count2" echo " devcount init /dev/lock3 /dev/count3" echo "" echo "devcountn 3 rw /dev/count 5 5 1" echo " devcount rw /dev/count1 5 5 1" echo " devcount rw /dev/count2 5 5 1" echo " devcount rw /dev/count3 5 5 1" echo "" echo "devcountn 3 lock /dev/lock rw /dev/count 5 5 1" echo " sanlock add_lockspace -s devcount:1:/dev/lock1:0" echo " (the add_lockspace command from each subsequent devcount will fail)" echo " devcount lock /dev/lock1 rw /dev/count1 5 5 1" echo " devcount lock /dev/lock2 rw /dev/count2 5 5 1" echo " devcount lock /dev/lock3 rw /dev/count3 5 5 1" echo "" exit 0 fi num=$1 cmd1=$2 if [ "$cmd1" != "init" ]; then deva=$3 cmd2=$4 devb=$5 sec1=$6 sec2=$7 hostid=$8 maxid=$9 i=1 echo sanlock add_lockspace -s devcount:$hostid:$deva$i:0 sanlock add_lockspace -s devcount:$hostid:$deva$i:0 fi end=`expr $num - 1` for i in `seq 0 $end`; do if [ "$cmd1" == "init" ]; then deva=$3 devb=$4 echo ./devcount init $deva$i $devb$i ./devcount init $deva$i $devb$i elif [ "$cmd1" == "rw" ] || [ "$cmd1" == "wr" ]; then echo ./devcount $cmd1 $deva$i $sec1 $sec2 $hostid ./devcount $cmd1 $deva$i $sec1 $sec2 $hostid & else echo ./devcount $cmd1 $deva$i $cmd2 $devb$i $sec1 $sec2 $hostid $maxid ./devcount $cmd1 $deva$i $cmd2 $devb$i $sec1 $sec2 $hostid $maxid & fi done sanlock-3.8.2/tests/direct_test.py000066400000000000000000000101351371427612200172100ustar00rootroot00000000000000# Copyright (C) 2019 Red Hat, Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. 
""" Test sanlock direct options. """ from __future__ import absolute_import import io import os import struct from . import constants from . import util from . units import MiB def test_init_lockspace(tmpdir): path = tmpdir.join("lockspace") size = MiB util.create_file(str(path), size) lockspace = "name:1:%s:0" % path util.sanlock("direct", "init", "-s", lockspace) with io.open(str(path), "rb") as f: magic, = struct.unpack("< I", f.read(4)) assert magic == constants.DELTA_DISK_MAGIC # TODO: check more stuff here... util.check_guard(str(path), size) def test_dump_lockspace_empty(tmpdir): path = tmpdir.join("lockspace") size = MiB util.create_file(str(path), size) lockspace = "name:1:%s:0" % path util.sanlock("direct", "init", "-s", lockspace) dump = "%s:0:1M" % path out = util.sanlock("direct", "dump", dump) lines = out.decode("utf-8").splitlines() spaces = [line.split() for line in lines] # Empty lockspace has no hosts. assert spaces == [ ['offset', 'lockspace', 'resource', 'timestamp', 'own', 'gen', 'lver'] ] def test_init_resource(tmpdir): path = tmpdir.join("resources") size = MiB util.create_file(str(path), size) resource = "ls_name:res_name:%s:0" % path util.sanlock("direct", "init", "-r", resource) with io.open(str(path), "rb") as f: magic, = struct.unpack("< I", f.read(4)) assert magic == constants.PAXOS_DISK_MAGIC # TODO: check more stuff here... util.check_guard(str(path), size) def test_dump_resources(tmpdir): path = tmpdir.join("resources") size = 8 * MiB util.create_file(str(path), size) # Write 2 resources with a hole between them. 
for i in [0, 2]: res = "ls_name:res_%d:%s:%dM" % (i, path, i) util.sanlock("direct", "init", "-r", res) dump = "%s:0:8M" % path out = util.sanlock("direct", "dump", dump) lines = out.decode("utf-8").splitlines() resources = [line.split() for line in lines] assert resources == [ ['offset', 'lockspace', 'resource', 'timestamp', 'own', 'gen', 'lver'], ['00000000', 'ls_name', 'res_0', '0000000000', '0000', '0000', '0'], ['02097152', 'ls_name', 'res_2', '0000000000', '0000', '0000', '0'], ] def test_dump_resources_start_before(tmpdir): path = tmpdir.join("resources") size = 8 * MiB util.create_file(str(path), size) # Write 2 resources at middle. for i in [4, 5]: res = "ls_name:res_%d:%s:%dM" % (i, path, i) util.sanlock("direct", "init", "-r", res) dump = "%s:2M:8M" % path out = util.sanlock("direct", "dump", dump) lines = out.decode("utf-8").splitlines() resources = [line.split() for line in lines] assert resources == [ ['offset', 'lockspace', 'resource', 'timestamp', 'own', 'gen', 'lver'], ['04194304', 'ls_name', 'res_4', '0000000000', '0000', '0000', '0'], ['05242880', 'ls_name', 'res_5', '0000000000', '0000', '0000', '0'], ] def test_path_with_colon(tmpdir): path = str(tmpdir.mkdir("with:colon").join("resources")) size = 8 * MiB util.create_file(path, size) # sanlock direct init does not support escaped colons in path. dirname, filename = os.path.split(path) res = "ls_name:res_0:%s:0M" % filename util.sanlock("direct", "init", "-r", res, cwd=dirname) # sanlock direct dump supports escaped colons in path. 
escaped_path = path.replace(":", "\\:") dump = "%s:0:8M" % escaped_path out = util.sanlock("direct", "dump", dump) lines = out.decode("utf-8").splitlines() resources = [line.split() for line in lines] assert resources == [ ['offset', 'lockspace', 'resource', 'timestamp', 'own', 'gen', 'lver'], ['00000000', 'ls_name', 'res_0', '0000000000', '0000', '0000', '0'], ] sanlock-3.8.2/tests/env.sh000066400000000000000000000010441371427612200154500ustar00rootroot00000000000000# Setup the environment for testing sanlock. # Use built libraries from source export LD_LIBRARY_PATH=$PWD/wdmd:$PWD/src # Disable privileged operations, allowing to run sanlock daemon as # non-privileged user. export SANLOCK_PRIVILEGED=0 # Use temporary sanlock run dir, usable for non-privileged user. This # is used by sanlock daemon to create a lockfile and socket, and by # sanlock clients for communicating with the daemon. export SANLOCK_RUN_DIR=/tmp/sanlock # Import sanlock extension module from source. export PYTHONPATH=$PWD/python sanlock-3.8.2/tests/killpath.c000066400000000000000000000030671371427612200163070ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include "sanlock.h" #include "sanlock_resource.h" int main(int argc, char *argv[]) { FILE *out; char *cmd = argv[0]; char args[1024]; char arg[1024]; char *state = NULL; int count = 0; int i, pid, rv; if (argc > 1 && !strcmp(argv[1], "-h")) { printf("%s_args - syslog args\n", cmd); printf("%s_term - kill SIGTERM\n", cmd); printf("%s_pause - sanlock_inquire, sanlock_release\n", cmd); } openlog(cmd, LOG_PID, LOG_DAEMON); memset(args, 0, sizeof(args)); for (i = 1; i < argc; i++) { memset(arg, 0, sizeof(arg)); sprintf(arg, "%s ", argv[i]); strcat(args, arg); } pid = atoi(argv[argc-1]); if (strstr(cmd, "args")) { syslog(LOG_ERR, "pid %d args %s\n", pid, args); } else if (strstr(cmd, "term")) { rv = kill(pid, SIGTERM); syslog(LOG_ERR, "sigterm pid %d errno %d\n", pid, errno); } else if 
(strstr(cmd, "pause")) { rv = kill(pid, SIGSTOP); if (rv < 0) syslog(LOG_ERR, "sigstop pid %d errno %d", pid, errno); rv = sanlock_inquire(-1, pid, 0, &count, &state); syslog(LOG_ERR, "inquire pid %d rv %d count %d state %s\n", pid, rv, count, state ? state : ""); rv = sanlock_release(-1, pid, SANLK_REL_ALL, 0, NULL); syslog(LOG_ERR, "release pid %d rv %d\n", pid, rv); out = fopen("/tmp/client-state.txt", "a"); if (out) { fprintf(out, "%d %s\n", pid, state); fclose(out); } if (state) free(state); } return 0; } sanlock-3.8.2/tests/python_test.py000066400000000000000000000630061371427612200172640ustar00rootroot00000000000000# Copyright (C) 2019 Red Hat, Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. """ Test sanlock python binding with sanlock daemon. """ from __future__ import absolute_import import errno import io import os import time from contextlib import contextmanager import pytest import six import sanlock from . import constants from . import util from . units import KiB, MiB, GiB, TiB # Largest file size on ext4 is 16TiB, and on xfs 500 TiB. Use 1 TiB as it is # large enough to test large offsets, and less likely to fail on developer # machine or CI slave. # See https://access.redhat.com/articles/rhel-limits LARGE_FILE_SIZE = TiB LOCKSPACE_SIZE = MiB MIN_RES_SIZE = MiB ALIGNMENT_1M = 1 * MiB ALIGNMENT_2M = 2 * MiB SECTOR_SIZE_512 = 512 SECTOR_SIZE_4K = 4 * KiB FILE_NAMES = [ # name, encoding ("ascii", None), (u"ascii", None), (u"\u05d0", None), (u"\u05d0", "utf-8"), ] LOCKSPACE_OR_RESOURCE_NAMES = [ # Bytes are supported with python 2 and 3. pytest.param(b"\xd7\x90"), # Python 2 also supports str. pytest.param( "\xd7\x90", marks=pytest.mark.skipif( six.PY3, reason="python 3 supports only bytes")), # Python 2 also supports unicode with ascii content. 
pytest.param( u"ascii", marks=pytest.mark.skipif( six.PY3, reason="python 3 supports only bytes")), ] @pytest.mark.parametrize("filename, encoding", FILE_NAMES) @pytest.mark.parametrize("size, offset", [ # Smallest offset. (LOCKSPACE_SIZE, 0), # Large offset. (LARGE_FILE_SIZE, LARGE_FILE_SIZE - LOCKSPACE_SIZE), ]) def test_write_lockspace( tmpdir, sanlock_daemon, filename, encoding, size, offset): path = util.generate_path(tmpdir, filename, encoding) util.create_file(path, size) # Test read and write with default alignment and sector size values. sanlock.write_lockspace(b"ls_name", path, offset=offset, iotimeout=1) ls = sanlock.read_lockspace(path, offset=offset) assert ls == {"iotimeout": 1, "lockspace": b"ls_name"} # Test read and write with explicit alignment and sector size values. sanlock.write_lockspace( b"ls_name", path, offset=offset, iotimeout=1, align=ALIGNMENT_1M, sector=SECTOR_SIZE_512) ls = sanlock.read_lockspace( path, offset=offset, align=ALIGNMENT_1M, sector=SECTOR_SIZE_512) assert ls == {"iotimeout": 1, "lockspace": b"ls_name"} acquired = sanlock.inq_lockspace( b"ls_name", 1, path, offset=offset, wait=False) assert acquired is False magic = util.read_magic(path, offset) assert magic == constants.DELTA_DISK_MAGIC # TODO: check more stuff here... util.check_guard(path, size) @pytest.mark.parametrize("align", sanlock.ALIGN_SIZE) def test_write_lockspace_4k(user_4k_path, sanlock_daemon, align): # Poison lockspace area, ensuring that previous tests will not break this # test, and sanlock does not write beyond the lockspace area. 
with io.open(user_4k_path, "rb+") as f: f.write(align * b"x") util.write_guard(user_4k_path, align) sanlock.write_lockspace( b"ls_name", user_4k_path, iotimeout=1, align=align, sector=SECTOR_SIZE_4K) ls = sanlock.read_lockspace( user_4k_path, align=align, sector=SECTOR_SIZE_4K) assert ls == {"iotimeout": 1, "lockspace": b"ls_name"} acquired = sanlock.inq_lockspace(b"ls_name", 1, user_4k_path, wait=False) assert acquired is False # Verify that lockspace was written. magic = util.read_magic(user_4k_path) assert magic == constants.DELTA_DISK_MAGIC # Check that sanlock did not write beyond the lockspace area. util.check_guard(user_4k_path, align) def test_write_lockspace_4k_invalid_sector_size(sanlock_daemon, user_4k_path): with pytest.raises(sanlock.SanlockException) as e: sanlock.write_lockspace( b"ls_name", user_4k_path, iotimeout=1, sector=SECTOR_SIZE_512) assert e.value.errno == errno.EINVAL def test_read_lockspace_4k_invalid_sector_size(sanlock_daemon, user_4k_path): sanlock.write_lockspace( b"ls_name", user_4k_path, iotimeout=1, sector=SECTOR_SIZE_4K) with pytest.raises(sanlock.SanlockException) as e: sanlock.read_lockspace(user_4k_path, sector=SECTOR_SIZE_512) assert e.value.errno == errno.EINVAL @pytest.mark.parametrize("filename,encoding", FILE_NAMES) @pytest.mark.parametrize("size,offset", [ # Smallest offset. (MIN_RES_SIZE, 0), # Large offset. (LARGE_FILE_SIZE, LARGE_FILE_SIZE - MIN_RES_SIZE), ]) def test_write_resource( tmpdir, sanlock_daemon, filename, encoding, size, offset): path = util.generate_path(tmpdir, filename, encoding) util.create_file(path, size) disks = [(path, offset)] # Test read and write with default alignment and sector size values. sanlock.write_resource(b"ls_name", b"res_name", disks) res = sanlock.read_resource(path, offset=offset) assert res == { "lockspace": b"ls_name", "resource": b"res_name", "version": 0 } # Test read and write with explicit alignment and sector size values. 
sanlock.write_resource( b"ls_name", b"res_name", disks, align=ALIGNMENT_1M, sector=SECTOR_SIZE_512) res = sanlock.read_resource( path, offset=offset, align=ALIGNMENT_1M, sector=SECTOR_SIZE_512) assert res == { "lockspace": b"ls_name", "resource": b"res_name", "version": 0 } owners = sanlock.read_resource_owners(b"ls_name", b"res_name", disks) assert owners == [] magic = util.read_magic(path, offset) assert magic == constants.PAXOS_DISK_MAGIC util.check_guard(path, size) @pytest.mark.parametrize("align", sanlock.ALIGN_SIZE) def test_write_resource_4k(sanlock_daemon, user_4k_path, align): disks = [(user_4k_path, 0)] # Poison resource area, ensuring that previous tests will not break this # test, and sanlock does not write beyond the lockspace area. with io.open(user_4k_path, "rb+") as f: f.write(align * b"x") util.write_guard(user_4k_path, align) sanlock.write_resource( b"ls_name", b"res_name", disks, align=align, sector=SECTOR_SIZE_4K) res = sanlock.read_resource( user_4k_path, align=align, sector=SECTOR_SIZE_4K) assert res == { "lockspace": b"ls_name", "resource": b"res_name", "version": 0 } owners = sanlock.read_resource_owners( b"ls_name", b"res_name", disks, align=align, sector=SECTOR_SIZE_4K) assert owners == [] # Verify that resource was written. magic = util.read_magic(user_4k_path) assert magic == constants.PAXOS_DISK_MAGIC # Check that sanlock did not write beyond the lockspace area. 
# NOTE(review): the next statement is the tail of a test whose definition
# starts before this chunk; it is kept exactly as found.
util.check_guard(user_4k_path, align)


@pytest.mark.xfail(reason="need to investigate why the call succeed")
def test_write_resource_4k_invalid_sector_size(sanlock_daemon, user_4k_path):
    """
    Writing a resource on the 4k path with a 512 byte sector size is
    expected to fail with EINVAL. Marked xfail since the call currently
    succeeds.
    """
    disks = [(user_4k_path, 0)]

    with pytest.raises(sanlock.SanlockException) as e:
        sanlock.write_resource(
            b"ls_name", b"res_name", disks, sector=SECTOR_SIZE_512)
    assert e.value.errno == errno.EINVAL


def test_clear_resource(tmpdir, sanlock_daemon):
    """
    Clearing a written resource makes it unreadable, leaves the clear magic
    on disk, and can be repeated on an already cleared resource.
    """
    path = util.generate_path(tmpdir, "clear_test")
    util.create_file(path, MiB)
    disks = [(path, 0)]

    sanlock.write_resource(b"ls_name", b"res_name", disks)
    sanlock.write_resource(b"ls_name", b"res_name", disks, clear=True)

    # Reading a cleared resource must fail on the leader magic check.
    with pytest.raises(sanlock.SanlockException) as e:
        sanlock.read_resource(path)
    assert e.value.errno == constants.SANLK_LEADER_MAGIC

    magic = util.read_magic(path)
    assert magic == constants.PAXOS_DISK_CLEAR

    util.check_guard(path, MiB)

    # run clear on already cleared resource
    sanlock.write_resource(b"ls_name", b"res_name", disks, clear=True)
    magic = util.read_magic(path)
    assert magic == constants.PAXOS_DISK_CLEAR


def test_clear_empty_lockspace_resource(tmpdir, sanlock_daemon):
    """
    Clearing works when both the lockspace and resource names are empty.
    """
    path = util.generate_path(tmpdir, "clear_test")
    util.create_file(path, MiB)
    disks = [(path, 0)]

    sanlock.write_resource(b"ls_name", b"res_name", disks)

    # Clear with empty lockspace and resource - should succeed
    sanlock.write_resource(b"", b"", disks, clear=True)

    magic = util.read_magic(path)
    assert magic == constants.PAXOS_DISK_CLEAR


def test_clear_empty_storage(tmpdir, sanlock_daemon):
    """
    Clearing works on storage where no resource was ever written.
    """
    path = util.generate_path(tmpdir, "clear_test")
    util.create_file(path, MiB)
    disks = [(path, 0)]

    # Clear area without any resource written - should succeed
    sanlock.write_resource(b"ls_name", b"inval_res_name", disks, clear=True)

    magic = util.read_magic(path)
    assert magic == constants.PAXOS_DISK_CLEAR


def test_read_resource_4k_invalid_sector_size(sanlock_daemon, user_4k_path):
    """
    Reading a resource written with 4k sectors using a 512 byte sector size
    fails with EINVAL.
    """
    disks = [(user_4k_path, 0)]

    sanlock.write_resource(
        b"ls_name",
        b"res_name",
        disks,
        align=ALIGNMENT_1M,
        sector=SECTOR_SIZE_4K)

    with pytest.raises(sanlock.SanlockException) as e:
        sanlock.read_resource(user_4k_path, sector=SECTOR_SIZE_512)
    assert e.value.errno == errno.EINVAL


def test_read_resource_owners_4k_invalid_sector_size(
        sanlock_daemon, user_4k_path):
    """
    Reading resource owners with a sector size that does not match the one
    used when writing the resource fails with EINVAL.
    """
    disks = [(user_4k_path, 0)]

    sanlock.write_resource(
        b"ls_name",
        b"res_name",
        disks,
        align=ALIGNMENT_1M,
        sector=SECTOR_SIZE_4K)

    with pytest.raises(sanlock.SanlockException) as e:
        sanlock.read_resource_owners(
            b"ls_name", b"res_name", disks, sector=SECTOR_SIZE_512)
    assert e.value.errno == errno.EINVAL


def test_read_resource_owners_invalid_align_size(tmpdir, sanlock_daemon):
    """
    Reading resource owners with an alignment that does not match the one
    used when writing the resource fails with EINVAL.
    """
    path = str(tmpdir.join("path"))
    util.create_file(path, GiB)
    disks = [(path, 0)]

    sanlock.write_resource(
        b"ls_name",
        b"res_name",
        disks,
        align=ALIGNMENT_1M,
        sector=SECTOR_SIZE_512)

    with pytest.raises(sanlock.SanlockException) as e:
        sanlock.read_resource_owners(
            b"ls_name",
            b"res_name",
            disks,
            align=ALIGNMENT_2M,
            sector=SECTOR_SIZE_512)
    assert e.value.errno == errno.EINVAL


@pytest.mark.parametrize("size,offset", [
    # Smallest offset.
    (MIN_RES_SIZE, 0),
    # Large offset.
    (LARGE_FILE_SIZE, LARGE_FILE_SIZE - MIN_RES_SIZE),
])
def test_add_rem_lockspace(tmpdir, sanlock_daemon, size, offset):
    """
    Add and remove a lockspace synchronously, checking inq_lockspace() and
    get_lockspaces() before adding, after adding and after removing.
    """
    path = str(tmpdir.join("ls_name"))
    util.create_file(path, size)

    sanlock.write_lockspace(b"ls_name", path, offset=offset, iotimeout=1)

    # Since the lockspace is not acquired, we expect to get False.
    acquired = sanlock.inq_lockspace(
        b"ls_name", 1, path, offset=offset, wait=False)
    assert acquired is False

    sanlock.add_lockspace(b"ls_name", 1, path, offset=offset, iotimeout=1)

    # Once the lockspace is acquired, we expect to get True.
    acquired = sanlock.inq_lockspace(
        b"ls_name", 1, path, offset=offset, wait=False)
    assert acquired is True

    lockspaces = sanlock.get_lockspaces()
    assert lockspaces == [{
        'flags': 0,
        'host_id': 1,
        'lockspace': b'ls_name',
        'offset': offset,
        'path': path
    }]

    sanlock.rem_lockspace(b"ls_name", 1, path, offset=offset)

    # Once the lockspace is released, we expect to get False.
    acquired = sanlock.inq_lockspace(
        b"ls_name", 1, path, offset=offset, wait=False)
    assert acquired is False

    lockspaces = sanlock.get_lockspaces()
    assert lockspaces == []


def test_add_rem_lockspace_async(tmpdir, sanlock_daemon):
    """
    Add and remove a lockspace with wait=False, checking that
    inq_lockspace() reports None while the operation is in progress.
    """
    path = str(tmpdir.join("ls_name"))
    util.create_file(path, MiB)

    sanlock.write_lockspace(b"ls_name", path, iotimeout=1)
    acquired = sanlock.inq_lockspace(b"ls_name", 1, path, wait=False)
    assert acquired is False

    # This will take 3 seconds.
    sanlock.add_lockspace(b"ls_name", 1, path, iotimeout=1, wait=False)

    # While the lockspace is being acquired, we expect to get None.
    time.sleep(1)
    acquired = sanlock.inq_lockspace(b"ls_name", 1, path, wait=False)
    assert acquired is None

    # Once the lockspace is acquired, we expect to get True.
    acquired = sanlock.inq_lockspace(b"ls_name", 1, path, wait=True)
    assert acquired is True

    # This will take about 3 seconds.
    sanlock.rem_lockspace(b"ls_name", 1, path, wait=False)

    # Wait until the lockspace change state from True to None.
    while sanlock.inq_lockspace(b"ls_name", 1, path, wait=False):
        time.sleep(1)

    # While the lockspace is being released, we expect to get None.
    acquired = sanlock.inq_lockspace(b"ls_name", 1, path, wait=False)
    assert acquired is None

    # Once the lockspace was released, we expect to get False.
    acquired = sanlock.inq_lockspace(b"ls_name", 1, path, wait=True)
    assert acquired is False


@pytest.mark.parametrize("size,offset", [
    # Smallest offset.
    (MIN_RES_SIZE, 0),
    # Large offset.
    (LARGE_FILE_SIZE, LARGE_FILE_SIZE - MIN_RES_SIZE),
])
def test_acquire_release_resource(tmpdir, sanlock_daemon, size, offset):
    """
    Acquire and release a resource in an added lockspace, checking the
    resource version, the resource owners and the host status at each step.
    """
    ls_path = str(tmpdir.join("ls_name"))
    util.create_file(ls_path, size)

    res_path = str(tmpdir.join("res_name"))
    util.create_file(res_path, size)

    sanlock.write_lockspace(b"ls_name", ls_path, offset=offset, iotimeout=1)
    sanlock.add_lockspace(b"ls_name", 1, ls_path, offset=offset, iotimeout=1)

    # Host status is not available until the first renewal.
    with pytest.raises(sanlock.SanlockException) as e:
        sanlock.get_hosts(b"ls_name", 1)
    assert e.value.errno == errno.EAGAIN

    time.sleep(1)
    host = sanlock.get_hosts(b"ls_name", 1)[0]
    assert host["flags"] == sanlock.HOST_LIVE

    disks = [(res_path, offset)]
    sanlock.write_resource(b"ls_name", b"res_name", disks)

    # A freshly written resource has version 0 and no owners.
    res = sanlock.read_resource(res_path, offset=offset)
    assert res == {
        "lockspace": b"ls_name",
        "resource": b"res_name",
        "version": 0
    }

    owners = sanlock.read_resource_owners(b"ls_name", b"res_name", disks)
    assert owners == []

    fd = sanlock.register()
    sanlock.acquire(b"ls_name", b"res_name", disks, slkfd=fd)

    # Acquiring bumps the version and makes this host the single owner.
    res = sanlock.read_resource(res_path, offset=offset)
    assert res == {
        "lockspace": b"ls_name",
        "resource": b"res_name",
        "version": 1
    }

    owner = sanlock.read_resource_owners(b"ls_name", b"res_name", disks)[0]

    assert owner["host_id"] == 1
    assert owner["flags"] == 0
    assert owner["generation"] == 1
    assert owner["io_timeout"] == 0  # Why 0?
    # TODO: check timestamp.

    host = sanlock.get_hosts(b"ls_name", 1)[0]
    assert host["flags"] == sanlock.HOST_LIVE
    assert host["generation"] == owner["generation"]

    sanlock.release(b"ls_name", b"res_name", disks, slkfd=fd)

    # Releasing keeps the version but drops the owner.
    res = sanlock.read_resource(res_path, offset=offset)
    assert res == {
        "lockspace": b"ls_name",
        "resource": b"res_name",
        "version": 1
    }

    owners = sanlock.read_resource_owners(b"ls_name", b"res_name", disks)
    assert owners == []


@pytest.mark.parametrize("align, sector", [
    # Invalid alignment
    (KiB, sanlock.SECTOR_SIZE[0]),
    # Invalid sector size
    (sanlock.ALIGN_SIZE[0], 8 * KiB),
])
def test_write_lockspace_invalid_align_sector(
        tmpdir, sanlock_daemon, align, sector):
    """
    write_lockspace() raises ValueError for an invalid align or sector value.
    """
    path = str(tmpdir.join("lockspace"))
    util.create_file(path, LOCKSPACE_SIZE)

    with pytest.raises(ValueError):
        sanlock.write_lockspace(b"ls_name", path, align=align, sector=sector)


@pytest.mark.parametrize("align, sector", [
    # Invalid alignment
    (KiB, sanlock.SECTOR_SIZE[0]),
    # Invalid sector size
    (sanlock.ALIGN_SIZE[0], 8 * KiB),
])
def test_write_resource_invalid_align_sector(
        tmpdir, sanlock_daemon, align, sector):
    """
    write_resource() raises ValueError for an invalid align or sector value.
    """
    path = str(tmpdir.join("resources"))
    util.create_file(path, MIN_RES_SIZE)
    disks = [(path, 0)]

    with pytest.raises(ValueError):
        sanlock.write_resource(
            b"ls_name", b"res_name", disks, align=align, sector=sector)


@pytest.mark.parametrize("disk", [
    # Not a tuple - unicode and bytes:
    "not a tuple",
    b"not a tuple",
    u'\u05e9\u05dc\u05d5\u05dd',
    b"\xd7\x90",
    # Tuple with incorrect length:
    (),
    ("path",),
    ("path", 0, "extra"),
    # Tuple with invalid content:
    (0, "path"),
    ("path", "not an offset"),
])
def test_write_resource_invalid_disk(tmpdir, sanlock_daemon, disk):
    """
    write_resource() raises ValueError for a malformed disk entry, and the
    error message includes the repr of the offending entry.
    """
    # Test parsing disks list with invalid content.
    disks = [disk]
    with pytest.raises(ValueError) as e:
        sanlock.write_resource(b"ls_name", b"res_name", disks)
    assert repr(disk) in str(e.value)


@pytest.mark.parametrize("filename,encoding", FILE_NAMES)
def test_killpath(tmpdir, sanlock_daemon, filename, encoding):
    """
    killpath() accepts every path in FILE_NAMES, both as the command path
    and as its single argument.
    """
    cmd_path = util.generate_path(tmpdir, filename, encoding)
    fd = sanlock.register()
    sanlock.killpath(cmd_path, [cmd_path], fd)


@contextmanager
def raises_sanlock_errno(expected_errno=errno.ECONNREFUSED):
    """
    Context manager asserting that the wrapped code raises SanlockException
    with errno equal to expected_errno (ECONNREFUSED by default).
    """
    with pytest.raises(sanlock.SanlockException) as e:
        yield
    assert e.value.errno == expected_errno


# The *_parse_args tests below use the no_sanlock_daemon fixture and expect
# the call to fail with a sanlock errno rather than with an argument error.
# NOTE(review): presumably this shows that argument parsing accepted the
# name/path variants and the failure came from the daemon being absent -
# confirm against the fixture definition.

@pytest.mark.parametrize("name", LOCKSPACE_OR_RESOURCE_NAMES)
@pytest.mark.parametrize("filename,encoding", FILE_NAMES)
def test_rem_lockspace_parse_args(no_sanlock_daemon, name, filename, encoding):
    """rem_lockspace() with each name/path variant fails with ECONNREFUSED."""
    path = util.generate_path("/tmp/", filename, encoding)
    with raises_sanlock_errno():
        sanlock.rem_lockspace(name, 1, path, 0, wait=False)


@pytest.mark.parametrize("name", LOCKSPACE_OR_RESOURCE_NAMES)
@pytest.mark.parametrize("filename,encoding", FILE_NAMES)
def test_add_lockspace_parse_args(no_sanlock_daemon, name, filename, encoding):
    """add_lockspace() with each name/path variant fails with ECONNREFUSED."""
    path = util.generate_path("/tmp/", filename, encoding)
    with raises_sanlock_errno():
        sanlock.add_lockspace(name, 1, path, 0, wait=False)


@pytest.mark.parametrize("name", LOCKSPACE_OR_RESOURCE_NAMES)
@pytest.mark.parametrize("filename,encoding", FILE_NAMES)
def test_write_lockspace_parse_args(
        no_sanlock_daemon, name, filename, encoding):
    """write_lockspace() with each name/path variant fails with ECONNREFUSED."""
    path = util.generate_path("/tmp/", filename, encoding)
    with raises_sanlock_errno():
        sanlock.write_lockspace(name, path)


@pytest.mark.parametrize("name", LOCKSPACE_OR_RESOURCE_NAMES)
@pytest.mark.parametrize("filename,encoding", FILE_NAMES)
def test_write_resource_parse_args(
        no_sanlock_daemon, name, filename, encoding):
    """
    write_resource() fails with ECONNREFUSED when the name variant is used
    as the lockspace name and when it is used as the resource name.
    """
    path = util.generate_path("/tmp/", filename, encoding)
    disks = [(path, 0)]
    with raises_sanlock_errno():
        sanlock.write_resource(name, b"res_name", disks)

    with raises_sanlock_errno():
        sanlock.write_resource(b"ls_name", name, disks)


def test_write_resource_path_length(no_sanlock_daemon):
    """
    A disk path of SANLK_PATH_LEN characters is rejected with ValueError;
    one character less gets past parsing and fails with ECONNREFUSED.
    """
    path = "x" * constants.SANLK_PATH_LEN
    with pytest.raises(ValueError):
        sanlock.write_resource(b"ls_name", b"res_name", [(path, 0)])

    path = "x" * (constants.SANLK_PATH_LEN - 1)
    with raises_sanlock_errno():
        sanlock.write_resource(b"ls_name", b"res_name", [(path, 0)])


@pytest.mark.parametrize("name", LOCKSPACE_OR_RESOURCE_NAMES)
@pytest.mark.parametrize("filename,encoding", FILE_NAMES)
def test_release_resource_parse_args(
        no_sanlock_daemon, name, filename, encoding):
    """
    release() fails with ECONNREFUSED when the name variant is used as the
    lockspace name and when it is used as the resource name.
    """
    path = util.generate_path("/tmp/", filename, encoding)
    disks = [(path, 0)]
    with raises_sanlock_errno():
        sanlock.release(name, b"res_name", disks)

    with raises_sanlock_errno():
        sanlock.release(b"ls_name", name, disks)


def test_release_resource_path_length(no_sanlock_daemon):
    """
    A disk path of SANLK_PATH_LEN characters is rejected with ValueError;
    one character less gets past parsing and fails with ECONNREFUSED.
    """
    path = "x" * constants.SANLK_PATH_LEN
    with pytest.raises(ValueError):
        sanlock.release(b"ls_name", b"res_name", [(path, 0)])

    path = "x" * (constants.SANLK_PATH_LEN - 1)
    with raises_sanlock_errno():
        sanlock.release(b"ls_name", b"res_name", [(path, 0)])


@pytest.mark.parametrize("name", LOCKSPACE_OR_RESOURCE_NAMES)
@pytest.mark.parametrize("filename,encoding", FILE_NAMES)
def test_read_resource_owners_parse_args(
        no_sanlock_daemon, name, filename, encoding):
    """
    read_resource_owners() fails with ECONNREFUSED when the name variant is
    used as the lockspace name and when it is used as the resource name.
    """
    path = util.generate_path("/tmp/", filename, encoding)
    disks = [(path, 0)]
    with raises_sanlock_errno():
        sanlock.read_resource_owners(name, b"res_name", disks)

    with raises_sanlock_errno():
        sanlock.read_resource_owners(b"ls_name", name, disks)


def test_read_resource_owners_path_length(no_sanlock_daemon):
    """
    A disk path of SANLK_PATH_LEN characters is rejected with ValueError;
    one character less gets past parsing and fails with ECONNREFUSED.
    """
    path = "x" * constants.SANLK_PATH_LEN
    with pytest.raises(ValueError):
        sanlock.read_resource_owners(b"ls_name", b"res_name", [(path, 0)])

    path = "x" * (constants.SANLK_PATH_LEN - 1)
    with raises_sanlock_errno():
        sanlock.read_resource_owners(b"ls_name", b"res_name", [(path, 0)])


@pytest.mark.parametrize("name", LOCKSPACE_OR_RESOURCE_NAMES)
def test_get_hosts_parse_args(no_sanlock_daemon, name):
    """get_hosts() with each name variant fails with ECONNREFUSED."""
    with raises_sanlock_errno():
        sanlock.get_hosts(name, 1)


@pytest.mark.parametrize("name", LOCKSPACE_OR_RESOURCE_NAMES)
@pytest.mark.parametrize("filename,encoding", FILE_NAMES)
def test_inq_lockspace_parse_args(no_sanlock_daemon, name, filename, encoding):
    """inq_lockspace() with each name/path variant fails with ECONNREFUSED."""
    path = util.generate_path("/tmp/", filename, encoding)
    with raises_sanlock_errno():
        sanlock.inq_lockspace(name, 1, path, wait=False)


@pytest.mark.parametrize("name", LOCKSPACE_OR_RESOURCE_NAMES)
def test_reg_event_parse_args(no_sanlock_daemon, name):
    """reg_event() with each name variant fails with ECONNREFUSED."""
    with raises_sanlock_errno():
        sanlock.reg_event(name)


@pytest.mark.parametrize("name", LOCKSPACE_OR_RESOURCE_NAMES)
def test_end_event_parse_args(no_sanlock_daemon, name):
    """end_event() on fd -1 with each name variant fails with EALREADY."""
    with raises_sanlock_errno(errno.EALREADY):
        sanlock.end_event(-1, name)


@pytest.mark.parametrize("name", LOCKSPACE_OR_RESOURCE_NAMES)
def test_set_event_parse_args(no_sanlock_daemon, name):
    """set_event() with each name variant fails with ECONNREFUSED."""
    with raises_sanlock_errno():
        sanlock.set_event(name, 1, 1, 1)


@pytest.mark.parametrize("name", LOCKSPACE_OR_RESOURCE_NAMES)
@pytest.mark.parametrize("filename,encoding", FILE_NAMES)
def test_init_lockspace_parse_args(
        no_sanlock_daemon, name, filename, encoding):
    """init_lockspace() with each name/path variant fails with ENODEV."""
    path = util.generate_path("/tmp/", filename, encoding)
    with raises_sanlock_errno(errno.ENODEV):
        sanlock.init_lockspace(name, path)


@pytest.mark.parametrize("name", LOCKSPACE_OR_RESOURCE_NAMES)
@pytest.mark.parametrize("filename,encoding", FILE_NAMES)
def test_init_resource_parse_args(no_sanlock_daemon, name, filename, encoding):
    """
    init_resource() fails with ENOENT when the name variant is used as the
    resource name and when it is used as the lockspace name.
    """
    path = util.generate_path("/tmp/", filename, encoding)
    disks = [(path, 0)]
    with raises_sanlock_errno(errno.ENOENT):
        sanlock.init_resource(b"ls_name", name, disks)

    with raises_sanlock_errno(errno.ENOENT):
        sanlock.init_resource(name, b"res_name", disks)


def test_init_resource_path_length(no_sanlock_daemon):
    """
    A disk path of SANLK_PATH_LEN characters is rejected with ValueError;
    one character less gets past parsing and fails with ENAMETOOLONG.
    """
    path = "x" * constants.SANLK_PATH_LEN
    with pytest.raises(ValueError):
        sanlock.init_resource(b"ls_name", b"res_name", [(path, 0)])

    # init_resource access storage directly.
    path = "x" * (constants.SANLK_PATH_LEN - 1)
    with raises_sanlock_errno(errno.ENAMETOOLONG):
        sanlock.init_resource(b"ls_name", b"res_name", [(path, 0)])


@pytest.mark.parametrize("filename,encoding", FILE_NAMES)
def test_get_alignment_parse_args(no_sanlock_daemon, filename, encoding):
    """get_alignment() with each path variant fails with ENOENT."""
    path = util.generate_path("/tmp/", filename, encoding)
    with raises_sanlock_errno(errno.ENOENT):
        sanlock.get_alignment(path)


@pytest.mark.parametrize("filename,encoding", FILE_NAMES)
def test_read_lockspace_parse_args(no_sanlock_daemon, filename, encoding):
    """read_lockspace() with each path variant fails with ECONNREFUSED."""
    path = util.generate_path("/tmp/", filename, encoding)
    with raises_sanlock_errno():
        sanlock.read_lockspace(path)


@pytest.mark.parametrize("filename,encoding", FILE_NAMES)
def test_read_resource_parse_args(no_sanlock_daemon, filename, encoding):
    """read_resource() with each path variant fails with ECONNREFUSED."""
    path = util.generate_path("/tmp/", filename, encoding)
    with raises_sanlock_errno():
        sanlock.read_resource(path)


def test_read_resource_path_length(no_sanlock_daemon):
    """
    A path of SANLK_PATH_LEN characters is rejected with ValueError; one
    character less gets past parsing and fails with ECONNREFUSED.
    """
    path = "x" * constants.SANLK_PATH_LEN
    with pytest.raises(ValueError):
        sanlock.read_resource(path)

    path = "x" * (constants.SANLK_PATH_LEN - 1)
    with raises_sanlock_errno():
        sanlock.read_resource(path)


@pytest.mark.parametrize("name", LOCKSPACE_OR_RESOURCE_NAMES)
@pytest.mark.parametrize("filename,encoding", FILE_NAMES)
def test_request_parse_args(no_sanlock_daemon, name, filename, encoding):
    """
    request() fails with ECONNREFUSED when the name variant is used as the
    resource name and when it is used as the lockspace name.
    """
    path = util.generate_path("/tmp/", filename, encoding)
    disks = [(path, 0)]
    with raises_sanlock_errno():
        sanlock.request(b"ls_name", name, disks)

    with raises_sanlock_errno():
        sanlock.request(name, b"res_name", disks)


def test_request_path_length(no_sanlock_daemon):
    """
    A disk path of SANLK_PATH_LEN characters is rejected with ValueError;
    one character less gets past parsing and fails with ECONNREFUSED.
    """
    path = "x" * constants.SANLK_PATH_LEN
    with pytest.raises(ValueError):
        sanlock.request(b"ls_name", b"res_name", [(path, 0)])

    path = "x" * (constants.SANLK_PATH_LEN - 1)
    with raises_sanlock_errno():
        sanlock.request(b"ls_name", b"res_name", [(path, 0)])


@pytest.mark.parametrize("name", LOCKSPACE_OR_RESOURCE_NAMES)
@pytest.mark.parametrize("filename,encoding", FILE_NAMES)
def
test_acquire_parse_args(no_sanlock_daemon, name, filename, encoding):
    path = util.generate_path("/tmp/", filename, encoding)
    disks = [(path, 0)]
    with raises_sanlock_errno():
        sanlock.acquire(b"ls_name", name, disks, pid=os.getpid())

    with raises_sanlock_errno():
        sanlock.acquire(name, b"res_name", disks, pid=os.getpid())


def test_acquire_path_length(no_sanlock_daemon):
    path = "x" * constants.SANLK_PATH_LEN
    with pytest.raises(ValueError):
        sanlock.acquire(b"ls_name", b"res_name", [(path, 0)], pid=os.getpid())

    path = "x" * (constants.SANLK_PATH_LEN - 1)
    with raises_sanlock_errno():
        sanlock.acquire(b"ls_name", b"res_name", [(path, 0)], pid=os.getpid())
sanlock-3.8.2/tests/sanlk_client.c000066400000000000000000000037431371427612200171460ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include
#include "sanlock.h"
#include "sanlock_resource.h"

/*
 * Copy src into dst, which holds dst_size bytes.
 * Returns 0 on success, -1 if src (plus terminator) does not fit.
 */
static int copy_str(char *dst, size_t dst_size, const char *src)
{
	int n = snprintf(dst, dst_size, "%s", src);

	if (n < 0 || (size_t)n >= dst_size)
		return -1;
	return 0;
}

/*
 * sanlk_client: register with the daemon, optionally set a killpath,
 * acquire one resource, restrict the connection, then loop forever
 * printing a message whenever more than 2 seconds passed between
 * iterations.
 *
 * argv[1] lockspace name, argv[2] resource name, argv[3] disk path,
 * argv[4] disk offset, argv[5] killpath program ("none" to skip),
 * argv[6..] killpath arguments, joined with a trailing space each.
 *
 * Returns -1 on usage or sanlock errors; does not return otherwise.
 */
int main(int argc, char *argv[])
{
	char rd[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)];
	char path[SANLK_HELPER_PATH_LEN];
	char args[SANLK_HELPER_PATH_LEN];
	struct sanlk_resource *res;
	time_t now, last;
	int sock, rv, i;

	if (argc < 6) {
		printf("sanlk_client \n");
		return -1;
	}

	memset(rd, 0, sizeof(rd));
	memset(path, 0, sizeof(path));
	memset(args, 0, sizeof(args));

	res = (struct sanlk_resource *)&rd;

	/*
	 * NOTE(review): these copies into the resource struct are still
	 * unbounded; the field sizes come from sanlock_resource.h which is
	 * not visible here - confirm and bound them as well.
	 */
	strcpy(res->lockspace_name, argv[1]);
	strcpy(res->name, argv[2]);
	res->num_disks = 1;
	strcpy(res->disks[0].path, argv[3]);
	res->disks[0].offset = atoi(argv[4]);

	/* path and args are fixed-size local buffers: reject oversize input
	   instead of overflowing the stack. */
	if (copy_str(path, sizeof(path), argv[5]) < 0) {
		fprintf(stderr, "killpath path too long\n");
		return -1;
	}

	for (i = 6; i < argc; i++) {
		size_t used = strlen(args);
		size_t room = sizeof(args) - used;
		int n = snprintf(args + used, room, "%s ", argv[i]);

		if (n < 0 || (size_t)n >= room) {
			fprintf(stderr, "killpath args too long\n");
			return -1;
		}
	}

	sock = sanlock_register();
	if (sock < 0) {
		fprintf(stderr, "register error %d\n", sock);
		return -1;
	}

	if (!strcmp(path, "none"))
		goto acquire;

	rv = sanlock_killpath(sock, SANLK_KILLPATH_PID, path, args);
	if (rv < 0) {
		fprintf(stderr, "killpath error %d\n", rv);
		return -1;
	}

 acquire:
	rv = sanlock_acquire(sock, -1, 0, 1, &res, NULL);
	if (rv < 0) {
		fprintf(stderr, "acquire error %d\n", rv);
		return -1;
	}

	rv = sanlock_restrict(sock, SANLK_RESTRICT_ALL);
	if (rv < 0) {
		fprintf(stderr, "restrict error %d\n", rv);
		return -1;
	}

	printf("%d running\n", getpid());

	last = time(NULL);

	while (1) {
		now = time(NULL);

		/* a gap of more than 2 seconds means this process was paused */
		if (now - last > 2)
			printf("%d running (paused %llu sec)\n",
			       getpid(), (unsigned long long)(now - last));
		last = now;
		sleep(1);
	}
}
sanlock-3.8.2/tests/sanlk_events.c000066400000000000000000000041201371427612200171620ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include
#include "sanlock.h"
#include "sanlock_admin.h"
#include "../src/sanlock_sock.h"

/* Set from the SIGTERM handler, polled by the main loop. */
static volatile sig_atomic_t prog_stop;

/* SIGTERM handler: request that the main loop stops. */
static void sigterm_handler(int sig)
{
	if (sig == SIGTERM)
		prog_stop = 1;
}

/*
 * sanlk_events: register for events on the lockspace named by argv[1],
 * print every event received until SIGTERM or a poll error, then
 * unregister.
 *
 * Returns 0 after the loop ends, -1 on usage or registration error.
 */
int main(int argc, char *argv[])
{
	struct sigaction act;
	struct sanlk_host_event he;
	struct pollfd pollfd;
	uint64_t from_host, from_gen;
	char *ls_name;
	int fd, rv;

	if (argc < 2) {
		printf("sanlk_events \n");
		return -1;
	}

	memset(&act, 0, sizeof(act));
	act.sa_handler = sigterm_handler;
	sigaction(SIGTERM, &act, NULL);

	ls_name = argv[1];

	printf("reg_event %s\n", ls_name);

	/* NOTE(review): whether sanlock_reg_event() reads he is not visible
	   here; zero it so it never sees stack garbage. */
	memset(&he, 0, sizeof(he));

	fd = sanlock_reg_event(ls_name, &he, 0);
	if (fd < 0) {
		printf("reg error %d\n", fd);
		return -1;
	}

	memset(&pollfd, 0, sizeof(pollfd));
	pollfd.fd = fd;
	pollfd.events = POLLIN;

	while (1) {
		rv = poll(&pollfd, 1, 1000);

		/* Check the stop flag before retrying on EINTR, otherwise a
		   SIGTERM that interrupts poll() costs one more timeout. */
		if (prog_stop)
			break;

		if (rv == -1 && errno == EINTR)
			continue;

		if (rv < 0) {
			printf("poll error %d\n", rv);
			break;
		}

		if (pollfd.revents & POLLIN) {
			/* drain all queued events */
			while (1) {
				rv = sanlock_get_event(fd, 0, &he, &from_host, &from_gen);
				if (rv == -EAGAIN) {
					/* no more events */
					break;
				}
				if (rv < 0) {
					printf("get_event error %d\n", rv);
					break;
				}
				printf("get_event host_id %llu generation %llu event 0x%llx data 0x%llx from %llu %llu\n",
				       (unsigned long long)he.host_id,
				       (unsigned long long)he.generation,
				       (unsigned long long)he.event,
				       (unsigned long long)he.data,
				       (unsigned long long)from_host,
				       (unsigned long long)from_gen);
			}
		}

		if (pollfd.revents & (POLLERR | POLLHUP | POLLNVAL)) {
			printf("poll revents %x\n", pollfd.revents);
			break;
		}
	}

	printf("end_event %s\n", ls_name);

	sanlock_end_event(fd, ls_name, 0);

	return 0;
}
sanlock-3.8.2/tests/sanlk_load.c000066400000000000000000000510141371427612200166010ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include
#include "sanlock.h"
#include "sanlock_admin.h"
#include "sanlock_resource.h"
#include "sanlock_direct.h"

#define ONEMB 1048576
#define LEASE_SIZE ONEMB

#define MAX_LS_COUNT 64
#define MAX_RES_COUNT 512
#define MAX_PID_COUNT 256
#define DEFAULT_LS_COUNT 4
#define DEFAULT_RES_COUNT 4
#define DEFAULT_PID_COUNT 4

/* acquire_rv/release_rv count results 0..-(MAX_RV-1), indexed by -rv */
#define MAX_RV 300

/* lock modes stored in lock_state[][]; IV marks "no pick" */
#define IV (-1)
#define UN 0
#define SH 3
#define EX 5

/* written by the SIGTERM handler, polled by the work loops */
volatile sig_atomic_t prog_stop;
int debug = 0;
int debug_verbose = 0;
char error_buf[4096];
char lock_disk_base[PATH_MAX];
int lock_state[MAX_LS_COUNT][MAX_RES_COUNT];
int ls_count = DEFAULT_LS_COUNT;
int res_count = DEFAULT_RES_COUNT;
int pid_count = DEFAULT_PID_COUNT;
int one_mode = 0;
int our_hostid;
int run_sec;
int count_sec;
int error_count;
int error_range = 1;
int acquire_rv[MAX_RV];
int release_rv[MAX_RV];

/* time_t is cast explicitly so the argument matches the conversion */
#define log_debug(fmt, args...) \
do { \
	if (debug) printf("%lu " fmt "\n", (unsigned long)time(NULL), ##args); \
} while (0)

#define log_error(fmt, args...) \
do { \
	memset(error_buf, 0, sizeof(error_buf)); \
	snprintf(error_buf, sizeof(error_buf) - 1, "%ld " fmt "\n", (long)time(NULL), ##args); \
	printf("ERROR: %s\n", error_buf); \
	syslog(LOG_ERR, "%s", error_buf); \
	error_count++; \
} while (0)

/* SIGTERM handler: request that the work loops stop. */
static void sigterm_handler(int sig)
{
	if (sig == SIGTERM)
		prog_stop = 1;
}

/*
 * Return a pseudo-random int in [a, b] based on random().
 *
 * The scaling is done in double: with float, values of random() close to
 * RAND_MAX round up to RAND_MAX+1 and the result could be b + 1.
 */
static int get_rand(int a, int b)
{
	return a + (int) (((double)(b - a + 1)) * random() / (RAND_MAX+1.0));
}

/* Return one_mode if it is SH or EX, otherwise a random choice of SH/EX. */
static int get_rand_sh_ex(void)
{
	unsigned int n;

	if (one_mode == SH)
		return SH;
	if (one_mode == EX)
		return EX;

	n = (unsigned int)random();
	if (n % 2)
		return SH;
	return EX;
}

/*
 * Count the result rv of an acquire (acquire != 0) or release call and
 * log an error if rv is not one of the results tolerated for the
 * configured error_range.
 *
 * rv must be in (-MAX_RV, 0]; anything else is logged as a failure and
 * not counted, since -rv is used as an index into arrays of MAX_RV ints.
 */
static void save_rv(int pid, int rv, int acquire)
{
	int *counts = acquire ? acquire_rv : release_rv;

	if (rv > 0 || rv <= -MAX_RV)
		goto fail;

	counts[-rv]++;

	switch (rv) {
	case 0:
	case -EBUSY:  /* -16 */
	case -EEXIST: /* -17 */
	case -EAGAIN: /* -11 */
		return;
	case -243:
	case -244:
	case -245:
		/* NOTE(review): presumably sanlock-specific results expected
		   with multiple nodes - confirm against sanlock_rv.h */
		if (error_range == 2)
			return;
		break;
	default:
		break;
	}

	/* other error_range values disable this check */
	if (error_range == 1 || error_range == 2)
		log_error("%d ERROR range %d save_rv %d %d", pid, error_range, rv, acquire);
	return;

 fail:
	log_error("%d save_rv %d %d", pid, rv, acquire);
	printf("%lu %d ERROR save_rv %d %d", (unsigned long)time(NULL), pid, rv, acquire);
}

/* Print the non-zero acquire and release result counters as "-rv:count". */
static void display_rv(int pid)
{
	int i;

	printf("%lu %d results acquire ", (unsigned long)time(NULL), pid);
	for (i = 0; i < MAX_RV; i++) {
		if (acquire_rv[i])
			printf("%d:%d ", i, acquire_rv[i]);
	}

	printf("release ");
	for (i = 0; i < MAX_RV; i++) {
		if (release_rv[i])
			printf("%d:%d ", i, release_rv[i]);
	}
	printf("\n");
}

/* Log every lockspace/resource pair that lock_state records as held. */
static void dump_lock_state(int pid)
{
	int i, j;

	for (i = 0; i < ls_count; i++) {
		for (j = 0; j < res_count; j++) {
			if (!lock_state[i][j])
				continue;
			log_error("%d lockspace%d:resource%d", pid, i, j);
		}
	}
}

/*
 * Debug-log the space separated entries of state, one per line.
 * state is modified in place: each space is replaced by a terminator.
 * A NULL or empty state is ignored.
 */
static void dump_inquire_state(int pid, char *state)
{
	char *p = state;
	int len;
	int i;

	if (!state)
		return;

	len = strlen(state);
	if (!len)
		return;

	for (i = 0; i < len; i++) {
		if (state[i] == ' ') {
			state[i] = '\0';
			if (!i)
				log_debug("%d leading space", pid);
			else
				log_debug("%d %s", pid, p);
			p = state + i + 1;
		}
	}

	log_debug("%d %s", pid, p);
}

/*
 * Compare the inquire output with lock_state.
 *
 * result and count are the return value and resource count of the
 * inquire call, res_state its state string (may be NULL).  Every
 * lockspace/resource pair must either be held in lock_state and present
 * in res_state, or be absent from both, and the number of held pairs
 * must equal count.
 *
 * Returns 0 if consistent, -1 (after logging) otherwise.
 */
static int check_lock_state(int pid, int result, int count, char *res_state)
{
	char buf[128];
	/* never hand NULL to a %s conversion */
	const char *state_str = res_state ? res_state : "(null)";
	char *found = NULL;
	int found_count = 0;
	int none_count = 0;
	int bad_count = 0;
	int i, j;

	memset(buf, 0, sizeof(buf));

	if (result < 0)
		goto fail;

	if (!count) {
		if (res_state) {
			log_error("%d check_lock_state zero count res_state %s",
				  pid, res_state);
		}
		for (i = 0; i < ls_count; i++) {
			for (j = 0; j < res_count; j++) {
				if (lock_state[i][j]) {
					bad_count++;
					log_error("%d check_lock_state zero count %d %d lock",
						  pid, i, j);
				}
			}
		}

		if (bad_count)
			goto fail;
		return 0;
	}

	/* a non-zero count without a state string cannot be checked */
	if (!res_state)
		goto fail;

	for (i = 0; i < ls_count; i++) {
		for (j = 0; j < res_count; j++) {
			memset(buf, 0, sizeof(buf));
			snprintf(buf, sizeof(buf), "lockspace%d:resource%d:", i, j);

			found = strstr(res_state, buf);

			if (found && lock_state[i][j]) {
				found_count++;
			} else if (!found && !lock_state[i][j]) {
				none_count++;
			} else {
				bad_count++;
				log_error("%d check_lock_state %s lock_state %d res_state %s",
					  pid, buf, lock_state[i][j], res_state);
			}
		}
	}

	if ((found_count != count) || bad_count)
		goto fail;

	return 0;

 fail:
	log_error("%d check_lock_state result %d count %d res_state %s",
		  pid, result, count, state_str);

	log_error("%d check_lock_state found %d none %d bad %d",
		  pid, found_count, none_count, bad_count);

	dump_lock_state(pid);

	printf("%lu %d ERROR check_lock_state result %d count %d found %d bad %d res_state %s",
	       (unsigned long)time(NULL), pid, result, count, found_count, bad_count, state_str);

	return -1;
}

#if 0
static int remove_lockspace(int i)
{
	struct sanlk_lockspace ls;
	int rv;

	memset(&ls, 0, sizeof(ls));
	sprintf(ls.host_id_disk.path, "%s%d", lock_disk_base, i);
	sprintf(ls.name, "lockspace%d", i);
	ls.host_id = our_hostid;

	printf("rem lockspace%d...\n", i);

	rv = sanlock_rem_lockspace(&ls, 0);
	if (rv < 0)
{ log_error("sanlock_rem_lockspace error %d %s", rv, ls.host_id_disk.path); return -1; } printf("rem done\n"); return 0; } #endif static int add_lockspace(int i) { struct sanlk_lockspace ls; int rv; int async = !(i % 2); uint32_t flags = 0; memset(&ls, 0, sizeof(ls)); sprintf(ls.host_id_disk.path, "%s%d", lock_disk_base, i); sprintf(ls.name, "lockspace%d", i); ls.host_id = our_hostid; if (async) flags = SANLK_ADD_ASYNC; printf("add lockspace%d...\n", i); rv = sanlock_add_lockspace(&ls, flags); if (rv == -EEXIST) return 0; if (rv < 0) { log_error("sanlock_add_lockspace error %d %s", rv, ls.host_id_disk.path); return -1; } if (!async) goto out; while (1) { rv = sanlock_inq_lockspace(&ls, 0); if (!rv) goto out; if (rv == -EINPROGRESS) { sleep(2); continue; } log_error("sanlock_inq_lockspace error %d", rv); return -1; } out: printf("add done\n"); return 0; } static int add_lockspaces(void) { int i, rv; for (i = 0; i < ls_count; i++) { rv = add_lockspace(i); if (rv < 0) return rv; } return 0; } static const char *mode_str(int n) { if (n == SH) return "sh"; if (n == EX) return "ex"; if (n == UN) return "un"; if (n == IV) return "iv"; return "er"; } static int do_one(int pid, int fd, int _s1, int _r1, int _n1, int *full) { char buf1[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; struct sanlk_resource *r1; int acquire = (_n1 != UN); int rv; memset(buf1, 0, sizeof(buf1)); r1 = (struct sanlk_resource *)&buf1; sprintf(r1->lockspace_name, "lockspace%d", _s1); sprintf(r1->name, "resource%d", _r1); sprintf(r1->disks[0].path, "%s%d", lock_disk_base, _s1); r1->disks[0].offset = (_r1+1)*LEASE_SIZE; r1->num_disks = 1; if (_n1 == SH) r1->flags |= SANLK_RES_SHARED; if (acquire) { rv = sanlock_acquire(fd, -1, 0, 1, &r1, NULL); if (rv == -E2BIG || rv == -ENOENT) *full = 1; } else { rv = sanlock_release(fd, -1, 0, 1, &r1); } log_debug("%d %s %d,%d %s = %d", pid, acquire ? 
"acquire" : "release", _s1, _r1, mode_str(_n1), rv); save_rv(pid, rv, acquire); return rv; } static int do_two(int pid, int fd, int _s1, int _r1, int _n1, int _s2, int _r2, int _n2, int *full) { char buf1[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; char buf2[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; struct sanlk_resource *r1; struct sanlk_resource *r2; struct sanlk_resource **res_args; int acquire = (_n1 != UN); int rv; res_args = malloc(2 * sizeof(struct sanlk_resource *)); if (!res_args) return -ENOMEM; memset(buf1, 0, sizeof(buf1)); memset(buf2, 0, sizeof(buf2)); r1 = (struct sanlk_resource *)&buf1; r2 = (struct sanlk_resource *)&buf2; res_args[0] = r1; res_args[1] = r2; sprintf(r1->lockspace_name, "lockspace%d", _s1); sprintf(r1->name, "resource%d", _r1); sprintf(r1->disks[0].path, "%s%d", lock_disk_base, _s1); r1->disks[0].offset = (_r1+1)*LEASE_SIZE; r1->num_disks = 1; if (_n1 == SH) r1->flags |= SANLK_RES_SHARED; sprintf(r2->lockspace_name, "lockspace%d", _s2); sprintf(r2->name, "resource%d", _r2); sprintf(r2->disks[0].path, "%s%d", lock_disk_base, _s2); r2->disks[0].offset = (_r2+1)*LEASE_SIZE; r2->num_disks = 1; if (_n2 == SH) r2->flags |= SANLK_RES_SHARED; if (acquire) { rv = sanlock_acquire(fd, -1, 0, 2, res_args, NULL); if (rv == -E2BIG || rv == -ENOENT) *full = 1; } else { rv = sanlock_release(fd, -1, 0, 2, res_args); } log_debug("%d %s %d,%d %s %d,%d %s = %d", pid, acquire ? 
"acquire" : "release", _s1, _r1, mode_str(_n1), _s2, _r2, mode_str(_n2), rv); save_rv(pid, rv, acquire); free(res_args); return rv; } static int acquire_one(int pid, int fd, int s1, int r1, int n1, int *full) { return do_one(pid, fd, s1, r1, n1, full); } static int acquire_two(int pid, int fd, int s1, int r1, int n1, int s2, int r2, int n2, int *full) { return do_two(pid, fd, s1, r1, n1, s2, r2, n2, full); } static int release_one(int pid, int fd, int s1, int r1) { return do_one(pid, fd, s1, r1, UN, NULL); } static int release_two(int pid, int fd, int s1, int r1, int s2, int r2) { return do_two(pid, fd, s1, r1, UN, s2, r2, UN, NULL); } static int release_all(int pid, int fd) { int rv; rv = sanlock_release(fd, -1, SANLK_REL_ALL, 0, NULL); log_debug("%d release all = %d", pid, rv); save_rv(pid, rv, 0); return rv; } static void inquire_all(int pid, int fd) { int rv, count = 0; char *state = NULL; if (prog_stop) return; rv = sanlock_inquire(fd, -1, 0, &count, &state); log_debug("%d inquire all = %d %d", pid, rv, count); if (prog_stop) return; check_lock_state(pid, rv, count, state); if (count && debug_verbose) dump_inquire_state(pid, state); if (state) free(state); } int do_rand_child(void) { int s1, s2, r1, r2, m1, m2, n1, n2, full; int fd, rv; int iter = 1; int pid = getpid(); error_count = 0; srandom(pid); memset(lock_state, 0, sizeof(lock_state)); fd = sanlock_register(); if (fd < 0) { log_error("%d sanlock_register error %d", pid, fd); exit(-1); } while (!prog_stop) { s1 = get_rand(0, ls_count-1); r1 = get_rand(0, res_count-1); m1 = lock_state[s1][r1]; s2 = -1; r2 = -1; m2 = IV; if (get_rand(1, 3) == 2) { s2 = get_rand(0, ls_count-1); r2 = get_rand(0, res_count-1); m2 = lock_state[s2][r2]; if (s1 == s2 && r1 == r2) { s2 = -1; r2 = -1; m2 = IV; } } full = 0; if (m1 == UN && m2 == UN) { /* both picks are unlocked, lock both together */ n1 = get_rand_sh_ex(); n2 = get_rand_sh_ex(); rv = acquire_two(pid, fd, s1, r1, n1, s2, r2, n2, &full); if (!rv) { 
lock_state[s1][r1] = n1; lock_state[s2][r2] = n2; } m1 = IV; m2 = IV; } if (m1 > UN && m2 > UN) { /* both picks are locked, unlock both together */ release_two(pid, fd, s1, r1, s2, r2); lock_state[s1][r1] = UN; lock_state[s2][r2] = UN; m1 = IV; m2 = IV; } if (m1 == UN) { n1 = get_rand_sh_ex(); rv = acquire_one(pid, fd, s1, r1, n1, &full); if (!rv) lock_state[s1][r1] = n1; } if (m2 == UN) { n2 = get_rand_sh_ex(); rv = acquire_one(pid, fd, s2, r2, n2, &full); if (!rv) lock_state[s2][r2] = n2; } if (m1 > UN) { release_one(pid, fd, s1, r1); lock_state[s1][r1] = UN; } if (m2 > UN) { release_one(pid, fd, s2, r2); lock_state[s2][r2] = UN; } if (full) { release_all(pid, fd); memset(lock_state, 0, sizeof(lock_state)); } if ((iter % 10) == 0) { display_rv(pid); inquire_all(pid, fd); } iter++; } display_rv(pid); if (error_count) { printf("pid %d done error_count %d\n", pid, error_count); exit(EXIT_FAILURE); } printf("pid %d done\n", pid); exit(EXIT_SUCCESS); } int do_all_child(void) { int sx, rx, full; int fd, rv; int pid = getpid(); srandom(pid); memset(lock_state, 0, sizeof(lock_state)); fd = sanlock_register(); if (fd < 0) { log_error("%d sanlock_register error %d", pid, fd); exit(-1); } while (!prog_stop) { for (sx = 0; sx < ls_count; sx++) { for (rx = 0; rx < res_count; rx++) { rv = acquire_one(pid, fd, sx, rx, EX, &full); if (!rv) lock_state[sx][rx] = EX; } inquire_all(pid, fd); for (rx = 0; rx < res_count-1; rx++) { rv = release_one(pid, fd, sx, rx); lock_state[sx][rx] = UN; } inquire_all(pid, fd); } } display_rv(pid); return 0; } /* * sanlk_load rand -i [-D -s -r -p ] */ void get_options(int argc, char *argv[]) { char optchar; char *optionarg; char *p; int i = 3; for (; i < argc; ) { p = argv[i]; if ((p[0] != '-') || (strlen(p) != 2)) { log_error("unknown option %s", p); log_error("space required before option value"); exit(EXIT_FAILURE); } optchar = p[1]; i++; if (optchar == 'D') { debug = 1; continue; } if (optchar == 'V') { debug_verbose = 1; continue; } if (i >= 
argc) { log_error("option '%c' requires arg", optchar); exit(EXIT_FAILURE); } optionarg = argv[i]; switch (optchar) { case 'i': our_hostid = atoi(optionarg); break; case 'S': run_sec = atoi(optionarg); break; case 's': ls_count = atoi(optionarg); if (ls_count > MAX_LS_COUNT) { log_error("max ls_count %d", MAX_LS_COUNT); exit(-1); } break; case 'r': res_count = atoi(optionarg); if (res_count > MAX_RES_COUNT) { log_error("max res_count %d", MAX_RES_COUNT); exit(-1); } break; case 'p': pid_count = atoi(optionarg); if (pid_count > MAX_PID_COUNT) { log_error("max pid_count %d", MAX_PID_COUNT); exit(-1); } break; case 'm': one_mode = atoi(optionarg); break; case 'e': error_range = atoi(optionarg); break; default: log_error("unknown option: %c", optchar); exit(EXIT_FAILURE); } i++; } } int find_pid(int *kids, int pid) { int i; for (i = 0; i < pid_count; i++) { if (kids[i] == pid) return i; } return -1; } int do_rand(int argc, char *argv[]) { struct sigaction act; int children[MAX_PID_COUNT]; int run_count = 0; int i, rv, pid, status; if (argc < 5) return -1; memset(&act, 0, sizeof(act)); act.sa_handler = sigterm_handler; sigaction(SIGTERM, &act, NULL); strcpy(lock_disk_base, argv[2]); get_options(argc, argv); rv = add_lockspaces(); if (rv < 0) return rv; printf("forking %d pids\n", pid_count); for (i = 0; i < pid_count; i++) { pid = fork(); if (pid < 0) { log_error("fork %d failed %d run_count %d", i, errno, run_count); break; } if (!pid) { do_rand_child(); exit(-1); } children[i] = pid; run_count++; } printf("children running\n"); while (!prog_stop) { #if 0 /* * kill and replace a random pid */ sleep(get_rand(1, 60)); if (prog_stop) break; i = get_rand(0, pid_count); pid = children[i]; printf("kill pid %d\n", pid); kill(pid, SIGKILL); rv = waitpid(pid, &status, 0); if (rv <= 0) continue; pid = fork(); if (pid < 0) { log_error("fork failed %d", errno); break; } else if (!pid) { do_rand_child(); exit(-1); } else { children[i] = pid; } #endif #if 0 /* * remove a random 
lockspace, replace any pids that were using * it, replace the lockspace */ sleep(get_rand(1, 60)); if (prog_stop) break; lsi = get_rand(0, ls_count-1); remove_lockspace(lsi); while (1) { rv = waitpid(-1, &status, WNOHANG); if (rv <= 0) break; if (!WIFEXITED(status)) continue; printf("exit pid %d\n", pid); i = find_pid(children, rv); if (i < 0) continue; pid = fork(); if (pid < 0) { log_error("fork failed %d", errno); break; } else if (!pid) { do_rand_child(); exit(-1); } else { children[i] = pid; } } add_lockspace(lsi); #endif if (run_sec && (count_sec >= run_sec)) break; count_sec++; sleep(1); } printf("stopping pids "); for (i = 0; i < pid_count; i++) kill(children[i], SIGTERM); while (run_count) { status = 0; pid = wait(&status); if (pid > 0) { run_count--; if (!WIFEXITED(status)) { error_count++; printf("-"); } else if (WEXITSTATUS(status)) { error_count++; printf("x"); } else { printf("."); } } } printf("\n"); if (error_count) { printf("child errors %d\n", error_count); exit(EXIT_FAILURE); } return 0; } int do_all(int argc, char *argv[]) { struct sigaction act; int children[MAX_PID_COUNT]; int run_count = 0; int i, rv, pid, status; if (argc < 5) return -1; memset(&act, 0, sizeof(act)); act.sa_handler = sigterm_handler; sigaction(SIGTERM, &act, NULL); strcpy(lock_disk_base, argv[2]); get_options(argc, argv); rv = add_lockspaces(); if (rv < 0) return rv; printf("forking %d pids\n", pid_count); for (i = 0; i < pid_count; i++) { pid = fork(); if (pid < 0) { log_error("fork %d failed %d run_count %d", i, errno, run_count); break; } if (!pid) { do_all_child(); exit(-1); } children[i] = pid; run_count++; } printf("children running\n"); while (!prog_stop) { sleep(1); } printf("stopping pids"); for (i = 0; i < pid_count; i++) kill(children[i], SIGTERM); while (run_count) { pid = wait(&status); if (pid > 0) { run_count--; printf("."); } } printf("\n"); if (error_count) { printf("error_count %d\n", error_count); exit(EXIT_FAILURE); } return 0; } /* * sanlk_load init [ ] 
* lock_disk_base = /dev/vg/foo * * sanlock direct init -s lockspace0:0:/dev/vg/foo0:0 * sanlock direct init -r lockspace0:resource0:/dev/vg/foo0:1M * sanlock direct init -r lockspace0:resource1:/dev/vg/foo0:2M * ... * sanlock direct init -s lockspace1:0:/dev/vg/foo1:0 * sanlock direct init -r lockspace1:resource0:/dev/vg/foo1:1M * sanlock direct init -r lockspace1:resource1:/dev/vg/foo1:2M * ... */ #define INIT_NUM_HOSTS 64 int do_init(int argc, char *argv[]) { char resbuf[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; struct sanlk_resource *res; struct sanlk_lockspace ls; int i, j, rv; if (argc < 3) return -1; strcpy(lock_disk_base, argv[2]); if (argc > 3) ls_count = atoi(argv[3]); if (argc > 4) res_count = atoi(argv[4]); for (i = 0; i < ls_count; i++) { memset(&ls, 0, sizeof(ls)); sprintf(ls.host_id_disk.path, "%s%d", lock_disk_base, i); sprintf(ls.name, "lockspace%d", i); rv = sanlock_direct_init(&ls, NULL, 0, INIT_NUM_HOSTS, 1); if (rv < 0) { printf("sanlock_direct_init lockspace error %d %s\n", rv, ls.host_id_disk.path); return -1; } for (j = 0; j < res_count; j++) { memset(resbuf, 0, sizeof(resbuf)); res = (struct sanlk_resource *)&resbuf; strcpy(res->lockspace_name, ls.name); sprintf(res->name, "resource%d", j); res->num_disks = 1; strcpy(res->disks[0].path, ls.host_id_disk.path); res->disks[0].offset = (j+1)*LEASE_SIZE; rv = sanlock_direct_init(NULL, res, 0, INIT_NUM_HOSTS, 0); if (rv < 0) { printf("sanlock_direct_init resource error %d\n", rv); return -1; } } } return 0; } int main(int argc, char *argv[]) { int rv = -1; if (argc < 2) goto out; if (!strcmp(argv[1], "init")) rv = do_init(argc, argv); else if (!strcmp(argv[1], "rand")) rv = do_rand(argc, argv); else if (!strcmp(argv[1], "all")) rv = do_all(argc, argv); if (!rv) return 0; out: printf("sanlk_load init [ ]\n"); printf(" init ls_count lockspaces, each with res_count resources\n"); printf(" devices for lockspaces 0..N are disk_base0..disk_baseN\n"); printf(" e.g. 
/dev/lock0, /dev/lock1, ... /dev/lockN\n"); printf("\n"); printf("sanlk_load rand -i [options]\n"); printf(" -s number of lockspaces\n"); printf(" -r number of resources per lockspace\n"); printf(" -p number of processes\n"); printf(" -m use one mode for all locks, 3 = SH, 5 = EX\n"); printf(" -S seconds to run (0 unlimited)\n"); printf(" -e error range expected (1 single node, 2 multi node)\n"); printf(" -D debug output\n"); printf(" -V verbose debug output\n"); printf("\n"); return -1; } sanlock-3.8.2/tests/sanlk_lockr.c000066400000000000000000000026251371427612200170000ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock.h" #include "sanlock_resource.h" /* gcc with -lsanlock */ int main(int argc, char *argv[]) { char rd[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; struct sanlk_resource *res; int sleep_sec; int fd, rv; if (argc < 6) { printf("acquire, [sleep], release\n"); printf("sanlk_lockr \n"); return -1; } memset(rd, 0, sizeof(rd)); res = (struct sanlk_resource *)&rd; strcpy(res->lockspace_name, argv[1]); strcpy(res->name, argv[2]); res->num_disks = 1; strcpy(res->disks[0].path, argv[3]); res->disks[0].offset = atoi(argv[4]); sleep_sec = atoi(argv[5]); fd = sanlock_register(); if (fd < 0) { fprintf(stderr, "register error %d\n", fd); return -1; } rv = sanlock_acquire(fd, -1, 0, 1, &res, NULL); if (rv < 0) { fprintf(stderr, "acquire error %d\n", rv); return -1; } if (sleep_sec) sleep(sleep_sec); rv = sanlock_release(fd, -1, 0, 1, &res); if (rv < 0) { fprintf(stderr, "release error %d\n", rv); return -1; } return 0; } sanlock-3.8.2/tests/sanlk_lvb.c000066400000000000000000000045621371427612200164530ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock.h" 
#include "sanlock_resource.h" /* gcc with -lsanlock */ int main(int argc, char *argv[]) { char rd[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; struct sanlk_resource *res; char lvb[512]; char *filename; char *act; FILE *fp; int fd, rv; int set = 0, get = 0; memset(lvb, 0, sizeof(lvb)); if (argc < 7) { printf("read file, write it to lvb:\n"); printf("sanlk_lvb set \n"); printf("\n"); printf("read lvb, write it to file:\n"); printf("sanlk_lvb get \n"); return -1; } memset(rd, 0, sizeof(rd)); res = (struct sanlk_resource *)&rd; act = argv[1]; strcpy(res->lockspace_name, argv[2]); strcpy(res->name, argv[3]); res->num_disks = 1; strcpy(res->disks[0].path, argv[4]); res->disks[0].offset = atoi(argv[5]); filename = argv[6]; if (!strcmp(act, "set")) { set = 1; fp = fopen(filename, "r"); } else if (!strcmp(act, "get")) { get = 1; fp = fopen(filename, "w"); } else { printf("bad action %s\n", act); return -1; } if (!fp) { printf("fopen failed %s\n", strerror(errno)); return -1; } fd = sanlock_register(); if (fd < 0) { printf("register error %d\n", fd); return -1; } rv = sanlock_acquire(fd, -1, SANLK_ACQUIRE_LVB, 1, &res, NULL); if (rv < 0) { printf("acquire error %d\n", rv); return -1; } if (get) { rv = sanlock_get_lvb(0, res, lvb, sizeof(lvb)); if (rv < 0) { printf("get_lvb error %d\n", rv); return -1; } fwrite(lvb, sizeof(lvb), 1, fp); if (ferror(fp)) { printf("fwrite error\n"); return -1; } } if (set) { fread(lvb, sizeof(lvb), 1, fp); if (ferror(fp)) { printf("fread error\n"); return -1; } rv = sanlock_set_lvb(0, res, lvb, sizeof(lvb)); if (rv < 0) { printf("set_lvb error %d\n", rv); return -1; } } fclose(fp); rv = sanlock_release(fd, -1, 0, 1, &res); if (rv < 0) { fprintf(stderr, "release error %d\n", rv); return -1; } return 0; } sanlock-3.8.2/tests/sanlk_path.c000066400000000000000000000103231371427612200166140ustar00rootroot00000000000000#include #include #include #include #include "sanlock.h" #include "sanlock_resource.h" #define DSTMAXSIZE 1024 static 
int __test_passed = 0; #define check_perror(expression, fmt, args...) \ if (expression) { \ __test_passed++; \ } \ else { \ fprintf(stderr, "%s:%i " fmt "\n", __FILE__, __LINE__, ##args); \ exit(1); \ } void test_sanlock_path_export(void) { int rv, dst_len; char dst_str[DSTMAXSIZE]; const char *src_str, *dst_exp; /* regular behavior, no escapes */ src_str = "Hello World"; dst_exp = src_str; dst_len = strlen(dst_exp); memset(dst_str, 'X', DSTMAXSIZE); /* destination too short */ rv = sanlock_path_export(dst_str, src_str, dst_len); check_perror(rv == 0, "sanlock_path_export wrong return code: %u", rv); check_perror(dst_str[dst_len] == 'X', "sanlock_path_export buffer overflow"); /* destination long enough */ rv = sanlock_path_export(dst_str, src_str, dst_len + 1); check_perror(rv == dst_len, "sanlock_path_export wrong return code: %u", rv); check_perror(dst_str[dst_len] == '\0', "sanlock_path_import destination not terminated"); check_perror(!strncmp(dst_str, dst_exp, dst_len), "sanlock_path_export destination is different"); /* special behavior, escapes */ src_str = "Hello World:"; dst_exp = "Hello World\\:"; dst_len = strlen(dst_exp); memset(dst_str, 'X', DSTMAXSIZE); /* destination too short */ rv = sanlock_path_export(dst_str, src_str, dst_len); check_perror(rv == 0, "sanlock_path_export wrong return code: %u", rv); check_perror(dst_str[dst_len] == 'X', "sanlock_path_export buffer overflow"); /* destination long enough */ rv = sanlock_path_export(dst_str, src_str, dst_len + 1); check_perror(rv == dst_len, "sanlock_path_export wrong return code: %u", rv); check_perror(dst_str[dst_len] == '\0', "sanlock_path_import destination not terminated"); check_perror(!strncmp(dst_str, dst_exp, dst_len), "sanlock_path_export destination is different"); } void test_sanlock_path_import(void) { int rv, dst_len; char dst_str[DSTMAXSIZE]; const char *src_str, *dst_exp; /* regular behavior, no escapes */ src_str = "Hello World"; dst_exp = src_str; dst_len = strlen(dst_exp); 
memset(dst_str, 'X', DSTMAXSIZE); /* destination too short */ rv = sanlock_path_import(dst_str, src_str, dst_len); check_perror(rv == 0, "sanlock_path_import wrong return code: %u", rv); check_perror(dst_str[dst_len] == 'X', "sanlock_path_import buffer overflow"); /* destination long enough */ rv = sanlock_path_import(dst_str, src_str, dst_len + 1); check_perror(rv == dst_len, "sanlock_path_import wrong return code: %u", rv); check_perror(dst_str[dst_len] == '\0', "sanlock_path_import destination not terminated"); check_perror(!strncmp(dst_str, dst_exp, dst_len), "sanlock_path_import destination is different"); /* special behavior, escapes */ src_str = "Hello World\\:"; dst_exp = "Hello World:"; dst_len = strlen(dst_exp); memset(dst_str, 'X', DSTMAXSIZE); /* destination too short */ rv = sanlock_path_import(dst_str, src_str, dst_len); check_perror(rv == 0, "sanlock_path_import wrong return code: %u", rv); check_perror(dst_str[dst_len] == 'X', "sanlock_path_import buffer overflow"); /* destination long enough */ rv = sanlock_path_import(dst_str, src_str, dst_len + 1); check_perror(rv == dst_len, "sanlock_path_import wrong return code: %u", rv); check_perror(dst_str[dst_len] == '\0', "sanlock_path_import destination not terminated"); check_perror(!strncmp(dst_str, dst_exp, dst_len), "sanlock_path_import destination is different"); } int main(int argc __attribute__ ((unused)), char *argv[] __attribute__ ((unused))) { test_sanlock_path_export(); test_sanlock_path_import(); printf("OK, %i tests sucessfully passed.\n", __test_passed); return 0; } sanlock-3.8.2/tests/sanlk_rename.c000066400000000000000000000033041371427612200171300ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sanlock.h" #include "sanlock_resource.h" /* gcc with -lsanlock */ int main(int argc, char *argv[]) { char rd[sizeof(struct sanlk_resource) + sizeof(struct 
sanlk_disk)]; char rd2[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; struct sanlk_resource *res; struct sanlk_resource *res2; struct sanlk_resource **res_args; int fd, rv; if (argc < 6) { printf("acquire with old name, release with new name\n"); printf("sanlk_rename \n"); return -1; } res_args = malloc(2 * sizeof(struct sanlk_resource *)); memset(rd, 0, sizeof(rd)); memset(rd2, 0, sizeof(rd2)); res = (struct sanlk_resource *)&rd; res2 = (struct sanlk_resource *)&rd2; res_args[0] = res; res_args[1] = res2; strcpy(res->lockspace_name, argv[1]); strcpy(res->name, argv[2]); strcpy(res2->name, argv[3]); res->num_disks = 1; strcpy(res->disks[0].path, argv[4]); res->disks[0].offset = atoi(argv[5]); fd = sanlock_register(); if (fd < 0) { fprintf(stderr, "register error %d\n", fd); return -1; } rv = sanlock_acquire(fd, -1, 0, 1, &res, NULL); if (rv < 0) { fprintf(stderr, "acquire error %d\n", rv); return -1; } rv = sanlock_release(fd, -1, SANLK_REL_RENAME, 2, res_args); if (rv < 0) { fprintf(stderr, "release error %d\n", rv); return -1; } return 0; } sanlock-3.8.2/tests/sanlk_string.c000066400000000000000000000063401371427612200171720ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "sanlock.h" #include "sanlock_resource.h" void print_res(struct sanlk_resource *res) { int i; printf("struct fields: \"%s\" \"%s\"", res->lockspace_name, res->name); for (i = 0; i < res->num_disks; i++) { printf(" \"%s\" %llu", res->disks[i].path, (unsigned long long)res->disks[i].offset); } printf(" flags %x", res->flags); printf(" lver %llu\n", (unsigned long long)res->lver); } int main(int argc, char *argv[]) { struct sanlk_lockspace ls; struct sanlk_resource *res; struct sanlk_resource **res_args = NULL; char *state; int res_count; int rv, i; if (argc < 2) { printf("%s RESOURCE RESOURCE ...\n", argv[0]); printf("%s -s LOCKSPACE\n", argv[0]); return 0; } if (!strcmp(argv[1], "-s")) { memset(&ls, 0, sizeof(ls)); rv = 
sanlock_str_to_lockspace(argv[2], &ls); printf("struct fields: \"%s\" %llu %u \"%s\" %llu\n", ls.name, (unsigned long long)ls.host_id, ls.flags, ls.host_id_disk.path, (unsigned long long)ls.host_id_disk.offset); return rv; } state = malloc(1024 * 1024); memset(state, 0, 1024 * 1024); printf("\n"); printf("sanlock_str_to_res for each argv\n", rv); printf("--------------------------------------------------------------------------------\n"); for (i = 1; i < argc; i++) { rv = sanlock_str_to_res(argv[i], &res); print_res(res); free(res); res = NULL; if (i > 1) strcat(state, " "); strcat(state, argv[i]); } printf("\n"); printf("combined argv input for state_to_args\n"); printf("--------------------------------------------------------------------------------\n"); printf("\"%s\"\n", state); rv = sanlock_state_to_args(state, &res_count, &res_args); printf("\n"); printf("sanlock_state_to_args %d res_count %d\n", rv, res_count); printf("--------------------------------------------------------------------------------\n"); for (i = 0; i < res_count; i++) { res = res_args[i]; print_res(res); } free(state); state = NULL; rv = sanlock_args_to_state(res_count, res_args, &state); printf("\n"); printf("sanlock_args_to_state %d\n", rv); printf("--------------------------------------------------------------------------------\n"); printf("\"%s\"\n", state); return 0; } #if 0 [root@bull-02 tests]# ./res_string 'LA:R1:/dev/foo1\:xx:0:/dev/foo2\:yy:0' 'LB:R2:/dev/bar:11' sanlock_str_to_res for each argv -------------------------------------------------------------------------------- struct fields: "LA" "R1" "/dev/foo1:xx" 0 "/dev/foo2:yy" 0 0 struct fields: "LB" "R2" "/dev/bar" 11 0 combined argv input for state_to_args -------------------------------------------------------------------------------- "LA:R1:/dev/foo1\:xx:0:/dev/foo2\:yy:0 LB:R2:/dev/bar:11" sanlock_state_to_args 0 res_count 2 -------------------------------------------------------------------------------- struct fields: 
"LA" "R1" "/dev/foo1:xx" 0 "/dev/foo2:yy" 0 0 struct fields: "LB" "R2" "/dev/bar" 11 0 sanlock_args_to_state 0 -------------------------------------------------------------------------------- "LA:R1:/dev/foo1\:xx:0:/dev/foo2\:yy:0:0 LB:R2:/dev/bar:11:0" #endif sanlock-3.8.2/tests/sanlk_testr.c000066400000000000000000000041171371427612200170250ustar00rootroot00000000000000#include #include #include #include #include #include #include #include "sanlock.h" #include "sanlock_admin.h" #include "sanlock_resource.h" void print_res(struct sanlk_resource *res) { int i; printf("struct fields: \"%s\" \"%s\"", res->lockspace_name, res->name); for (i = 0; i < res->num_disks; i++) { printf(" \"%s\" %llu", res->disks[i].path, (unsigned long long)res->disks[i].offset); } printf(" flags %x", res->flags); printf(" lver %llu\n", (unsigned long long)res->lver); } int main(int argc, char *argv[]) { struct sanlk_resource *res = NULL; struct sanlk_host *hosts = NULL; struct sanlk_host *owners = NULL; struct sanlk_host *host, *owner; int hosts_count = 0; int owners_count = 0; uint32_t test_flags = 0; int i, rv; if (argc < 2) { printf("%s RESOURCE\n", argv[0]); return 0; } rv = sanlock_str_to_res(argv[1], &res); if (rv < 0) { printf("str_to_res %d\n", rv); goto out; } rv = sanlock_get_hosts(res->lockspace_name, 0, &hosts, &hosts_count, 0); if (rv < 0) { printf("get_hosts %d\n", rv); goto out; } rv = sanlock_read_resource_owners(res, 0, &owners, &owners_count); if (rv < 0) { printf("read_resource_owners %d\n", rv); goto out; } rv = sanlock_test_resource_owners(res, 0, owners, owners_count, hosts, hosts_count, &test_flags); if (rv < 0) { printf("test_resource_owners %d\n", rv); goto out; } printf("lockspace hosts:\n"); host = hosts; for (i = 0; i < hosts_count; i++) { printf("host %llu gen %llu state %u\n", (unsigned long long)host->host_id, (unsigned long long)host->generation, host->flags & SANLK_HOST_MASK); host++; } printf("resource owners:\n"); owner = owners; for (i = 0; i < 
def main():
    """
    Command line entry point.

    Accepts a single positional argument, "setup" or "teardown", configures
    INFO level logging with a "storage: " prefix and runs the matching
    function.
    """
    cli = argparse.ArgumentParser(
        description='Storage helper for sanlock tests')
    cli.add_argument("command", choices=["setup", "teardown"])
    command = cli.parse_args().command

    logging.basicConfig(level=logging.INFO, format="storage: %(message)s")

    handler = {"setup": setup, "teardown": teardown}.get(command)
    if handler is not None:
        handler()
def is_mounted(mountpoint):
    """
    Return True if a filesystem is mounted exactly at mountpoint.

    Each line of /proc/self/mounts is split on whitespace and its second
    field (the mount directory) is compared with mountpoint. Comparing the
    field instead of searching the whole line avoids false positives when
    mountpoint is merely a prefix of another mount directory or appears in a
    device name or in the mount options.

    The comparison is textual: mountpoint must be spelled the way the kernel
    reports it (no trailing slash; whitespace in the path appears escaped in
    the mounts file).
    """
    with open("/proc/self/mounts") as f:
        for line in f:
            fields = line.split()
            if len(fields) > 1 and fields[1] == mountpoint:
                return True
    return False
lease release using killpath echo messages: sanlock check lease warn/fail, kill 100, all pids clear echo messages: wdmd warn, close, fail echo messages: killpath_pause date set -x ./clientn 4 start $dev 1 /root/killpath_pause sleep 5 ./clientn 4 error $dev sleep 150 ./clientn 4 resume $dev 1 sleep 5 killall -9 sanlk_client sleep 5 set +x echo test lockspace storage loss, recovery by escalation from killpath to sigkill echo messages: sanlock check lease warn/fail, kill 100, kill 9, dead, all pids clear echo messages: wdmd warn, close, fail echo messages: killpath_args date set -x ./clientn 4 start $dev 1 /root/killpath_args sleep 5 ./clientn 4 error $dev sleep 150 ./clientn 4 linear $dev 1 sleep 5 set +x echo test lockspace storage loss, recovery by pid exit using killpath echo messages: sanlock check lease warn/fail, kill 100, dead, all pids clear echo messages: wdmd warn, close, fail echo messages: killpath_term date set -x ./clientn 4 start $dev 1 /root/killpath_term sleep 5 ./clientn 4 error $dev sleep 150 ./clientn 4 linear $dev 1 sleep 5 set +x echo test lockspace storage loss, recovery by pid sigterm without killpath echo messages: sanlock check lease warn/fail, kill 15, dead, all pids clear echo messages: wdmd warn, close, fail date set -x ./clientn 4 start $dev 1 none sleep 5 ./clientn 4 error $dev sleep 150 ./clientn 4 linear $dev 1 sleep 5 set +x echo test lockspace storage delay, small enough to have no effect echo messages: none date set -x ./clientn 4 start $dev 1 none sleep 22 ./clientn 4 iodelay $dev 57 sleep 5 killall -9 sanlk_client sleep 5 set +x echo test lockspace storage delay, long enough to produce sanlock warning, echo but not failure, not long enough for wdmd warn or close echo messages: sanlock check lease warn date set -x ./clientn 4 start $dev 1 none sleep 22 ./clientn 4 iodelay $dev 67 sleep 5 killall -9 sanlk_client sleep 5 set +x echo test lockspace storage delay, long enough to produce sanlock warning, echo but not failure/recovery, 
long enough for wdmd warn and close echo messages: sanlock check lease warn echo messages: wdmd warn, close date set -x ./clientn 4 start $dev 1 none sleep 22 ./clientn 4 iodelay $dev 77 sleep 5 killall -9 sanlk_client sleep 5 set +x echo test lockspace storage delay, long enough to produce sanlock warning, echo failure/recovery, recovery by lease release using killpath echo messages: sanlock check lease warn/fail, kill 100, all pids clear echo messages: killpath_pause echo messages: wdmd warn, close, fail date set -x ./clientn 4 start $dev 1 /root/killpath_pause sleep 22 ./clientn 4 iodelay $dev 87 sleep 5 set +x echo test lockspace storage delay, long enough to produce sanlock warning, echo failure/recovery, recovery by pid sigterm without killpath echo messages: sanlock check lease warn/fail, kill 15, dead, all pids clear echo messages: wdmd warn, close, fail date set -x ./clientn 4 start $dev 1 none sleep 22 ./clientn 4 iodelay $dev 87 sleep 5 set +x echo test daemon run delay, small enough to have no effect echo messages: none date set -x ./clientn 4 start $dev 1 none sleep 22 ./clientn 4 delay 58 sleep 5 killall -9 sanlk_client sleep 5 set +x echo test daemon run delay, long enough to produce sanlock warning, echo but not failure, not long enough for wdmd warn or close echo messages: sanlock check lease warn date set -x ./clientn 4 start $dev 1 none sleep 22 ./clientn 4 delay 68 sleep 5 killall -9 sanlk_client sleep 5 set +x echo test daemon run delay, long enough to produce sanlock warning, echo but not failure, long enough for wdmd warn and close echo messages: sanlock check lease warn echo messages: wdmd warn, close date set -x ./clientn 4 start $dev 1 none sleep 22 ./clientn 4 delay 78 sleep 5 killall -9 sanlk_client sleep 5 set +x echo test daemon run delay, long enough to produce sanlock echo failure/recovery, recovery by lease release using killpath echo messages: sanlock check lease fail, kill 100, all pids clear echo messages: wdmd warn, close, fail 
echo messages: killpath_pause date set -x ./clientn 4 start $dev 1 /root/killpath_pause sleep 22 ./clientn 4 delay 88 sleep 5 ./clientn 4 resume $dev 1 sleep 5 killall -9 sanlk_client sleep 5 set +x echo test daemon run delay, long enough to produce sanlock echo failure/recovery, recovery by pid sigterm without killpath echo messages: sanlock check lease fail, kill 15, dead, all pids clear echo messages: wdmd warn, close, fail date set -x ./clientn 4 start $dev 1 none sleep 22 ./clientn 4 delay 88 sleep 5 set +x echo test daemon run delay, long enough to produce sanlock echo failure/recovery, recovery by pid sigkill after skipping killpath echo messages: sanlock check lease fail, kill 9, dead, all pids clear echo messages: wdmd warn, close, fail date set -x ./clientn 4 start $dev 1 /root/killpath_pause sleep 22 ./clientn 4 delay 130 sleep 5 set +x echo test daemon run delay, long enough to produce sanlock echo failure/recovery, recovery by pid sigkill without killpath echo messages: sanlock check lease fail, kill 9, dead, all pids clear echo messages: wdmd warn, close, fail date set -x ./clientn 4 start $dev 1 none sleep 22 ./clientn 4 delay 130 sleep 5 set +x echo test daemon run delay, long enough to produce watchdog firing echo messages: wdmd warn, close, fail date set -x ./clientn 4 start $dev 1 none sleep 22 ./clientn 4 delay 140 echo should not get here sanlock-3.8.2/tests/units.py000066400000000000000000000006101371427612200160360ustar00rootroot00000000000000# Copyright (C) 2019 Red Hat, Inc. # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. """ Constants for file/disk sizes. 
class CommandError(Exception):
    """
    Describes a command that finished with a non-zero return code.

    Attributes:
        cmd: the command that was run.
        returncode: the command's return code.
        stdout: what the command wrote to stdout.
        stderr: what the command wrote to stderr.

    str() of the exception is the class attribute msg formatted with these
    values.
    """

    msg = ("Command {self.cmd} failed with returncode={self.returncode}, "
           "stdout={self.stdout!r}, stderr={self.stderr!r}")

    def __init__(self, cmd, returncode, stdout, stderr):
        # Hand the values to Exception so that self.args is populated;
        # copying or pickling an exception re-creates it from self.args and
        # fails when a required constructor argument is missing there.
        super(CommandError, self).__init__(cmd, returncode, stdout, stderr)
        self.cmd = cmd
        self.returncode = returncode
        self.stdout = stdout
        self.stderr = stderr

    def __str__(self):
        return self.msg.format(self=self)
def wait_for_termination(p, timeout):
    """
    Block until p.poll() reports a result.

    The process is polled every 0.05 seconds. TimeoutExpired is raised if
    the process is still running once more than timeout seconds have passed
    since the call started.
    """
    give_up_at = time.time() + timeout
    while p.poll() is None:
        if time.time() > give_up_at:
            raise TimeoutExpired
        time.sleep(0.05)
/*
 * Open a stream socket and connect it to the address provided by
 * wdmd_socket_address().
 *
 * Returns the connected socket descriptor on success.  On failure returns
 * the negative result of wdmd_socket_address(), or -errno from socket() or
 * connect(); no descriptor is left open in that case.
 */
int wdmd_connect(void)
{
	struct sockaddr_un addr;
	int rv, s;

	/*
	 * Resolve the address before creating the socket so that a failure
	 * here cannot leak a descriptor.
	 */
	rv = wdmd_socket_address(&addr);
	if (rv < 0)
		return rv;

	s = socket(AF_LOCAL, SOCK_STREAM, 0);
	if (s < 0)
		return -errno;

	rv = connect(s, (struct sockaddr *)&addr, sizeof(struct sockaddr_un));
	if (rv < 0) {
		/* save errno before close() can overwrite it */
		rv = -errno;
		close(s);
		return rv;
	}
	return s;
}
= h.fire_timeout; *last_keepalive = h.last_keepalive; return 0; } sanlock-3.8.2/wdmd/main.c000066400000000000000000001051401371427612200152070ustar00rootroot00000000000000/* * Copyright 2011-2012 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "wdmd.h" #include "wdmd_sock.h" #ifndef GNUC_UNUSED #define GNUC_UNUSED __attribute__((__unused__)) #endif #define DEFAULT_TEST_INTERVAL 10 #define RECOVER_TEST_INTERVAL 1 #define DEFAULT_FIRE_TIMEOUT 60 #define DEFAULT_HIGH_PRIORITY 0 /* * If the group name specified here, or specified on the * command line is not found, then default to gid 0 (root). 
/*
 * One slot of the client[] array; pollfd[] holds the matching poll entry
 * at the same index.
 */
struct client {
	int used;		/* set to 1 by client_add() when the slot is handed out */
	int fd;			/* connection fd, -1 while the slot has no open connection */
	int pid;		/* NOTE(review): presumably the peer pid from get_peer_pid(); set outside the code shown here */
	int pid_dead;		/* set by client_pid_dead() when a connection with an expire time is closed */
	int refcount;		/* left set by client_pid_dead() for such a client so the daemon will not shut down cleanly on sigterm */
	uint64_t renewal;	/* renewal time, logged by client_pid_dead(); units not visible here */
	uint64_t expire;	/* expire time; 0 means closing the connection simply frees the slot */
	void *workfn;		/* callback given to client_add() as void (*)(int ci) */
	void *deadfn;		/* callback given to client_add() as void (*)(int ci) */
	char name[WDMD_NAME_SIZE];	/* client name, logged by client_pid_dead() */
};
\ do { \ log_debug(fmt, ##args); \ syslog(LOG_ERR, fmt, ##args); \ } while (0) #define log_script(i) \ log_error("script %.64s last_result %d start %llu run %u fail %u good %u kill %u long %u", \ scripts[i].name, scripts[i].last_result, \ (unsigned long long)scripts[i].start, \ scripts[i].run_count, scripts[i].fail_count, \ scripts[i].good_count, scripts[i].kill_count, \ scripts[i].long_count); static uint64_t monotime(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return ts.tv_sec; } /* * test clients */ static void client_alloc(void) { int i; if (!client) { client = malloc(CLIENT_NALLOC * sizeof(struct client)); pollfd = malloc(CLIENT_NALLOC * sizeof(struct pollfd)); } else { client = realloc(client, (client_size + CLIENT_NALLOC) * sizeof(struct client)); pollfd = realloc(pollfd, (client_size + CLIENT_NALLOC) * sizeof(struct pollfd)); if (!pollfd) log_error("can't alloc for pollfd"); } if (!client || !pollfd) log_error("can't alloc for client array"); for (i = client_size; i < client_size + CLIENT_NALLOC; i++) { memset(&client[i], 0, sizeof(struct client)); client[i].fd = -1; pollfd[i].fd = -1; pollfd[i].revents = 0; } client_size += CLIENT_NALLOC; } static int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci)) { int i; if (!client) client_alloc(); again: for (i = 0; i < client_size; i++) { if (!client[i].used) { client[i].used = 1; client[i].workfn = workfn; client[i].deadfn = deadfn; client[i].fd = fd; pollfd[i].fd = fd; pollfd[i].events = POLLIN; if (i > client_maxi) client_maxi = i; return i; } } client_alloc(); goto again; } static void client_pid_dead(int ci) { if (!client[ci].expire) { log_debug("client_pid_dead ci %d", ci); close(client[ci].fd); /* refcount automatically dropped if a client with no expiration is closed */ client[ci].used = 0; memset(&client[ci], 0, sizeof(struct client)); client[ci].fd = -1; pollfd[ci].fd = -1; pollfd[ci].events = 0; } else { /* * Leave used and expire set so that test_clients will 
continue * monitoring this client and expire if necessary. * * Leave refcount set so that the daemon will not cleanly shut * down if it gets a sigterm. * * This case of a client con with an expire time being closed * is a fatal condition; there's no way to clear or extend the * expire time and no way to cleanly shut down the daemon. * This should never happen. * * (We don't enforce that a client with an expire time also has refcount * set, but I can't think of case where setting expire but not refcount * would be useful.) */ log_error("client dead ci %d fd %d pid %d renewal %llu expire %llu %s", ci, client[ci].fd, client[ci].pid, (unsigned long long)client[ci].renewal, (unsigned long long)client[ci].expire, client[ci].name); close(client[ci].fd); client[ci].pid_dead = 1; client[ci].fd = -1; pollfd[ci].fd = -1; pollfd[ci].events = 0; } } static int get_peer_pid(int fd, int *pid) { struct ucred cred; unsigned int cl = sizeof(cred); if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cred, &cl) != 0) return -1; *pid = cred.pid; return 0; } #define DEBUG_SIZE (1024 * 1024) #define LINE_SIZE 256 char debug_buf[DEBUG_SIZE]; static void dump_debug(int fd) { char line[LINE_SIZE]; uint64_t now; int line_len; int debug_len = 0; int i; memset(debug_buf, 0, DEBUG_SIZE); now = monotime(); memset(line, 0, sizeof(line)); snprintf(line, 255, "wdmd %d socket_gid %d high_priority %d now %llu last_keepalive %llu last_closeunclean %llu allow_scripts %d kill_script_sec %d\n", getpid(), socket_gid, high_priority, (unsigned long long)now, (unsigned long long)last_keepalive, (unsigned long long)last_closeunclean, allow_scripts, kill_script_sec); line_len = strlen(line); strncat(debug_buf, line, LINE_SIZE); debug_len += line_len; for (i = 0; i < MAX_SCRIPTS; i++) { if (!scripts[i].name[0]) continue; memset(line, 0, sizeof(line)); snprintf(line, 255, "script %d name %.64s pid %d now %llu start %llu last_result %d run %u fail %u good %u kill %u long %u\n", i, scripts[i].name, scripts[i].pid, 
(unsigned long long)now, (unsigned long long)scripts[i].start, scripts[i].last_result, scripts[i].run_count, scripts[i].fail_count, scripts[i].good_count, scripts[i].kill_count, scripts[i].long_count); line_len = strlen(line); if (debug_len + line_len >= DEBUG_SIZE - 1) goto out; strncat(debug_buf, line, LINE_SIZE); debug_len += line_len; } for (i = 0; i < client_size; i++) { if (!client[i].used) continue; memset(line, 0, sizeof(line)); snprintf(line, 255, "client %d name %.64s pid %d fd %d dead %d ref %d now %llu renewal %llu expire %llu\n", i, client[i].name, client[i].pid, client[i].fd, client[i].pid_dead, client[i].refcount, (unsigned long long)now, (unsigned long long)client[i].renewal, (unsigned long long)client[i].expire); line_len = strlen(line); if (debug_len + line_len >= DEBUG_SIZE - 1) goto out; strncat(debug_buf, line, LINE_SIZE); debug_len += line_len; } out: send(fd, debug_buf, debug_len, MSG_NOSIGNAL); } static void process_connection(int ci) { struct wdmd_header h; struct wdmd_header h_ret; void (*deadfn)(int ci); int rv, pid; memset(&h, 0, sizeof(h)); rv = recv(client[ci].fd, &h, sizeof(h), MSG_WAITALL); if (!rv) return; if (rv < 0) { log_error("ci %d recv error %d", ci, errno); goto dead; } if (rv != sizeof(h)) { log_error("ci %d recv size %d", ci, rv); goto dead; } switch(h.cmd) { case CMD_REGISTER: /* TODO: allow client to reconnect, search clients for h.name and copy the renewal and expire times, then clear the old client entry */ rv = get_peer_pid(client[ci].fd, &pid); if (rv < 0) goto dead; client[ci].pid = pid; memcpy(client[ci].name, h.name, WDMD_NAME_SIZE); log_debug("register ci %d fd %d pid %d %s", ci, client[ci].fd, pid, client[ci].name); break; case CMD_REFCOUNT_SET: client[ci].refcount = 1; break; case CMD_REFCOUNT_CLEAR: client[ci].refcount = 0; break; case CMD_TEST_LIVE: client[ci].renewal = h.renewal_time; client[ci].expire = h.expire_time; log_debug("test_live ci %d renewal %llu expire %llu", ci, (unsigned long 
long)client[ci].renewal, (unsigned long long)client[ci].expire); break; case CMD_STATUS: memcpy(&h_ret, &h, sizeof(h)); h_ret.test_interval = test_interval; h_ret.fire_timeout = fire_timeout; h_ret.last_keepalive = last_keepalive; send(client[ci].fd, &h_ret, sizeof(h_ret), MSG_NOSIGNAL); break; case CMD_DUMP_DEBUG: strncpy(client[ci].name, "dump", WDMD_NAME_SIZE); dump_debug(client[ci].fd); break; }; return; dead: deadfn = client[ci].deadfn; if (deadfn) deadfn(ci); } static void process_listener(int ci) { int fd; int on = 1; fd = accept(client[ci].fd, NULL, NULL); if (fd < 0) return; setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on)); client_add(fd, process_connection, client_pid_dead); } static void close_clients(void) { } static int setup_listener_socket(int *listener_socket) { int rv, s; struct sockaddr_un addr; s = socket(AF_LOCAL, SOCK_STREAM, 0); if (s < 0) return -errno; rv = wdmd_socket_address(&addr); if (rv < 0) return rv; unlink(addr.sun_path); rv = bind(s, (struct sockaddr *) &addr, sizeof(struct sockaddr_un)); if (rv < 0) { rv = -errno; close(s); return rv; } rv = listen(s, 5); if (rv < 0) { rv = -errno; close(s); return rv; } rv = chmod(addr.sun_path, DEFAULT_SOCKET_MODE); if (rv < 0) { rv = -errno; close(s); return rv; } rv = chown(addr.sun_path, -1, socket_gid); if (rv < 0) { rv = -errno; close(s); return rv; } fcntl(s, F_SETFL, fcntl(s, F_GETFL, 0) | O_NONBLOCK); *listener_socket = s; return 0; } static int setup_clients(void) { int rv, fd = -1, ci; rv = setup_listener_socket(&fd); if (rv < 0) return rv; ci = client_add(fd, process_listener, client_pid_dead); strncpy(client[ci].name, "listen", WDMD_NAME_SIZE); return 0; } static int test_clients(void) { uint64_t t; time_t last_ping; int fail_count = 0; int i; t = monotime(); for (i = 0; i < client_size; i++) { if (!client[i].used) continue; if (!client[i].expire) continue; if (last_keepalive > last_closeunclean) last_ping = last_keepalive; else last_ping = last_closeunclean; if (t >= 
client[i].expire) { log_error("test failed rem %d now %llu ping %llu close %llu renewal %llu expire %llu client %d %s", DEFAULT_FIRE_TIMEOUT - (int)(t - last_ping), (unsigned long long)t, (unsigned long long)last_keepalive, (unsigned long long)last_closeunclean, (unsigned long long)client[i].renewal, (unsigned long long)client[i].expire, client[i].pid, client[i].name); fail_count++; continue; } /* * If we can patch the kernel to avoid a close-ping, * then we can remove this early/preemptive fail/close * of the device, but instead just not pet the device * when the expiration time is reached. Also see * close_watchdog_unclean() below. * * We do this fail/close (which generates a ping) * TEST_INTERVAL before the expire time because we want * the device to fire at most 60 seconds after the * expiration time. That means we need the last ping * (from close) to be TEST_INTERVAL before to the * expiration time. * * If we did the close at/after the expiration time, * then the ping from the close would mean that the * device would fire between 60 and 70 seconds after the * expiration time. 
*/ if (t >= client[i].expire - DEFAULT_TEST_INTERVAL) { log_error("test warning now %llu ping %llu close %llu renewal %llu expire %llu client %d %s", (unsigned long long)t, (unsigned long long)last_keepalive, (unsigned long long)last_closeunclean, (unsigned long long)client[i].renewal, (unsigned long long)client[i].expire, client[i].pid, client[i].name); fail_count++; continue; } } return fail_count; } static int active_clients(void) { int i; for (i = 0; i < client_size; i++) { if (client[i].refcount) return 1; } return 0; } #ifdef TEST_FILES #define FILES_DIR "/run/wdmd/test_files" const char *files_built = " files"; static DIR *files_dir; static void close_files(void) { closedir(files_dir); } static int setup_files(void) { mode_t old_umask; int rv; old_umask = umask(0022); rv = mkdir(FILES_DIR, 0777); if (rv < 0 && errno != EEXIST) goto out; files_dir = opendir(FILES_DIR); if (!files_dir) rv = -errno; else rv = 0; out: umask(old_umask); return rv; } static int read_file(char *name, uint64_t *renewal, uint64_t *expire) { FILE *file; char path[PATH_MAX]; snprintf(path, PATH_MAX-1, "%s/%s", FILES_DIR, name); file = fopen(path, "r"); if (!file) return -1; fscanf(file, "renewal %llu expire %llu", renewal, expire); fclose(file); return 0; } static int test_files(void) { struct dirent *de; uint64_t t, renewal, expire; int fail_count = 0; int rv; while ((de = readdir(files_dir))) { if (de->d_name[0] == '.') continue; rv = read_file(de->d_name, &renewal, &expire); if (rv < 0) continue; t = monotime(); if (t >= expire) { log_error("test failed file %s renewal %llu expire %llu ", de->d_name, (unsigned long long)renewal, (unsigned long long)expire); fail_count++; } } return fail_count; } #else static void close_files(void) { } static int setup_files(void) { return 0; } static int test_files(void) { return 0; } #endif /* TEST_FILES */ static int find_script(char *name) { int i; for (i = 0; i < MAX_SCRIPTS; i++) { if (!strncmp(scripts[i].name, name, PATH_MAX)) return i; } 
return -1; } static int add_script(char *name) { int i; for (i = 0; i < MAX_SCRIPTS; i++) { if (scripts[i].name[0]) continue; log_debug("add_script %d %s", i, name); strncpy(scripts[i].name, name, PATH_MAX); return i; } log_debug("script %s no space", name); return -1; } static int check_path(char *path) { struct stat st; int rv; rv = stat(path, &st); if (rv < 0) return -errno; if (!(S_ISREG(st.st_mode))) return -1; if (!(st.st_mode & S_IXUSR)) return -1; return 0; } static int run_script(int i) { char path[PATH_MAX]; int pid, rv; memset(path, 0, sizeof(path)); snprintf(path, PATH_MAX-1, "%s/%s", scripts_dir, scripts[i].name); rv = check_path(path); if (rv < 0) return rv; pid = fork(); if (pid < 0) return -errno; if (pid) { log_debug("script %s pid %d", scripts[i].name, pid); return pid; } else { execlp(path, path, NULL); exit(EXIT_FAILURE); } } static void close_scripts(void) { } static int setup_scripts(void) { char path[PATH_MAX]; struct dirent **namelist; int i, s, rv, de_count; if (!allow_scripts) return 0; de_count = scandir(scripts_dir, &namelist, 0, alphasort); if (de_count < 0) return 0; for (i = 0; i < de_count; i++) { if (namelist[i]->d_name[0] == '.') goto next; memset(path, 0, sizeof(path)); snprintf(path, PATH_MAX-1, "%s/%s", scripts_dir, namelist[i]->d_name); rv = check_path(path); if (rv < 0) { log_debug("script %s ignore %d", namelist[i]->d_name, rv); goto next; } s = find_script(namelist[i]->d_name); if (s < 0) add_script(namelist[i]->d_name); next: free(namelist[i]); } free(namelist); return 0; } static int test_scripts(void) { int i, rv, pid, result, running, fail_count, status; uint64_t begin, now; if (!allow_scripts) return 0; fail_count = 0; begin = monotime(); for (i = 0; i < MAX_SCRIPTS; i++) { if (!scripts[i].name[0]) continue; /* pid didn't exit in previous cycle */ if (scripts[i].pid) continue; /* * after a script reports success, don't call it again before * the normal test interval; this is needed because the test * interval becomes 
shorter when failures occur */ if (!scripts[i].last_result && ((begin - scripts[i].start) < (DEFAULT_TEST_INTERVAL - 1))) continue; pid = run_script(i); if (pid <= 0) { log_error("script %s removed %d", scripts[i].name, pid); memset(&scripts[i], 0, sizeof(struct script_status)); } else { scripts[i].pid = pid; scripts[i].start = begin; scripts[i].run_count++; } } /* wait up to DEFAULT_TEST_INTERVAL-1 for the pids to finish */ while (1) { running = 0; for (i = 0; i < MAX_SCRIPTS; i++) { if (!scripts[i].name[0]) continue; if (!scripts[i].pid) continue; rv = waitpid(scripts[i].pid, &status, WNOHANG); if (rv < 0) { /* shouldn't happen */ log_error("script %s pid %d waitpid error %d %d", scripts[i].name, scripts[i].pid, rv, errno); log_script(i); running++; } else if (!rv) { /* pid still running, has not changed state */ running++; } else if (rv == scripts[i].pid) { /* pid state has changed */ if (WIFEXITED(status)) { /* pid exited with an exit code */ result = WEXITSTATUS(status); if (result) { log_error("script %s pid %d exit status %d", scripts[i].name, scripts[i].pid, result); scripts[i].fail_count++; scripts[i].last_result = result; scripts[i].pid = 0; fail_count++; log_script(i); } else { scripts[i].good_count++; scripts[i].last_result = 0; scripts[i].pid = 0; } } else if (WIFSIGNALED(status)) { /* pid terminated due to a signal */ log_error("script %s pid %d term signal %d", scripts[i].name, scripts[i].pid, WTERMSIG(status)); scripts[i].kill_count++; scripts[i].last_result = EINTR; scripts[i].pid = 0; fail_count++; log_script(i); } else { /* pid state changed but still running */ running++; } } else { /* shouldn't happen */ log_error("script %s pid %d waitpid rv %d", scripts[i].name, scripts[i].pid, rv); log_script(i); running++; } /* option to kill script after it's run for kill_script_sec */ if (scripts[i].pid && kill_script_sec && (monotime() - scripts[i].start >= kill_script_sec)) { kill(scripts[i].pid, SIGKILL); } } if (!running) break; if (monotime() - begin 
>= DEFAULT_TEST_INTERVAL - 1) break; sleep(1); } if (!running) goto out; /* any pids that have not exited count as a failed for this cycle */ now = monotime(); for (i = 0; i < MAX_SCRIPTS; i++) { if (!scripts[i].name[0]) continue; if (!scripts[i].pid) continue; scripts[i].long_count++; fail_count++; log_error("script %s pid %d start %llu now %llu taking too long", scripts[i].name, scripts[i].pid, (unsigned long long)scripts[i].start, (unsigned long long)now); log_script(i); } out: return fail_count; } static int open_dev(void) { int fd; if (dev_fd != -1) { log_error("watchdog already open fd %d", dev_fd); return -1; } fd = open(watchdog_path, O_WRONLY | O_CLOEXEC); if (fd < 0) { log_error("open %s error %d", watchdog_path, errno); return fd; } dev_fd = fd; return 0; } static void close_watchdog_unclean(void) { if (dev_fd == -1) { log_debug("close_watchdog_unclean already closed"); return; } log_error("%s closed unclean", watchdog_path); close(dev_fd); dev_fd = -1; last_closeunclean = monotime(); } static void close_watchdog(void) { int rv; if (dev_fd == -1) { log_error("close_watchdog already closed"); return; } rv = write(dev_fd, "V", 1); if (rv < 0) log_error("%s disarm write error %d", watchdog_path, errno); else log_error("%s disarmed", watchdog_path); close(dev_fd); dev_fd = -1; } static int _setup_watchdog(char *path) { struct stat buf; int rv, timeout; strncpy(watchdog_path, path, WDPATH_SIZE); watchdog_path[WDPATH_SIZE - 1] = '\0'; rv = stat(watchdog_path, &buf); if (rv < 0) return -1; rv = open_dev(); if (rv < 0) return -1; timeout = 0; rv = ioctl(dev_fd, WDIOC_GETTIMEOUT, &timeout); if (rv < 0) { log_error("%s failed to report timeout", watchdog_path); close_watchdog(); return -1; } if (timeout == fire_timeout) goto out; timeout = fire_timeout; rv = ioctl(dev_fd, WDIOC_SETTIMEOUT, &timeout); if (rv < 0) { log_error("%s failed to set timeout", watchdog_path); close_watchdog(); return -1; } if (timeout != fire_timeout) { log_error("%s failed to set new 
timeout", watchdog_path); close_watchdog(); return -1; } out: log_error("%s armed with fire_timeout %d", watchdog_path, fire_timeout); /* TODO: save watchdog_path in /run/wdmd/saved_path, * and in startup read that file, copying it to saved_path */ return 0; } /* * Order of preference: * . saved path (path used before daemon restart) * . command line option (-w) * . /dev/watchdog0 * . /dev/watchdog1 * . /dev/watchdog */ static int setup_watchdog(void) { int rv; if (!saved_path[0]) goto opt; rv = _setup_watchdog(saved_path); if (!rv) return 0; opt: if (!option_path[0] || !strcmp(saved_path, option_path)) goto zero; rv = _setup_watchdog(option_path); if (!rv) return 0; zero: if (!strcmp(saved_path, "/dev/watchdog0") || !strcmp(option_path, "/dev/watchdog0")) goto one; rv = _setup_watchdog((char *)"/dev/watchdog0"); if (!rv) return 0; one: if (!strcmp(saved_path, "/dev/watchdog1") || !strcmp(option_path, "/dev/watchdog1")) goto old; rv = _setup_watchdog((char *)"/dev/watchdog1"); if (!rv) return 0; old: if (!strcmp(saved_path, "/dev/watchdog") || !strcmp(option_path, "/dev/watchdog")) goto out; rv = _setup_watchdog((char *)"/dev/watchdog"); if (!rv) return 0; out: log_error("no watchdog device, load a watchdog driver"); return -1; } static int probe_dev(const char *path) { struct stat buf; int fd, err, rv, timeout; rv = stat(path, &buf); if (rv < 0) { fprintf(stderr, "error %d stat %s\n", errno, path); return -1; } fd = open(path, O_WRONLY | O_CLOEXEC); if (fd < 0) { fprintf(stderr, "error %d open %s\n", errno, path); return fd; } timeout = 0; rv = ioctl(fd, WDIOC_GETTIMEOUT, &timeout); if (rv < 0) { fprintf(stderr, "error %d ioctl gettimeout %s\n", errno, path); rv = -1; goto out; } if (timeout == fire_timeout) { printf("%s\n", path); rv = 0; goto out; } timeout = fire_timeout; rv = ioctl(fd, WDIOC_SETTIMEOUT, &timeout); if (rv < 0) { fprintf(stderr, "error %d ioctl settimeout %s\n", errno, path); rv = -1; goto out; } if (timeout != fire_timeout) { fprintf(stderr, 
"error %d invalid timeout %s\n", errno, path); rv = -1; goto out; } printf("%s\n", path); rv = 0; out: err = write(fd, "V", 1); if (err < 0) { fprintf(stderr, "probe failed to disarm %s error %d %d\n", path, err, errno); openlog("wdmd", LOG_CONS | LOG_PID, LOG_DAEMON); syslog(LOG_ERR, "probe failed to disarm %s error %d %d\n", path, err, errno); } close(fd); return rv; } static int probe_watchdog(void) { int rv; if (!saved_path[0]) goto opt; rv = probe_dev(saved_path); if (!rv) return 0; opt: if (!option_path[0] || !strcmp(saved_path, option_path)) goto zero; rv = probe_dev(option_path); if (!rv) return 0; zero: if (!strcmp(saved_path, "/dev/watchdog0") || !strcmp(option_path, "/dev/watchdog0")) goto one; rv = probe_dev((char *)"/dev/watchdog0"); if (!rv) return 0; one: if (!strcmp(saved_path, "/dev/watchdog1") || !strcmp(option_path, "/dev/watchdog1")) goto old; rv = probe_dev((char *)"/dev/watchdog1"); if (!rv) return 0; old: if (!strcmp(saved_path, "/dev/watchdog") || !strcmp(option_path, "/dev/watchdog")) goto out; rv = probe_dev((char *)"/dev/watchdog"); if (!rv) return 0; out: fprintf(stderr, "no watchdog device, load a watchdog driver\n"); return -1; } static void pet_watchdog(void) { int rv, unused; rv = ioctl(dev_fd, WDIOC_KEEPALIVE, &unused); last_keepalive = monotime(); log_debug("keepalive %d", rv); } static void process_signals(int ci) { struct signalfd_siginfo fdsi; ssize_t rv; int fd = client[ci].fd; rv = read(fd, &fdsi, sizeof(struct signalfd_siginfo)); if (rv != sizeof(struct signalfd_siginfo)) { return; } if ((fdsi.ssi_signo == SIGTERM) || (fdsi.ssi_signo == SIGINT)) { if (!active_clients()) daemon_quit = 1; } if (fdsi.ssi_signo == SIGHUP) { setup_scripts(); } } static int setup_signals(void) { sigset_t mask; int fd, rv, ci; sigemptyset(&mask); sigaddset(&mask, SIGTERM); sigaddset(&mask, SIGINT); sigaddset(&mask, SIGHUP); rv = sigprocmask(SIG_BLOCK, &mask, NULL); if (rv < 0) return rv; fd = signalfd(-1, &mask, 0); if (fd < 0) return -errno; ci = 
client_add(fd, process_signals, client_pid_dead); strncpy(client[ci].name, "signal", WDMD_NAME_SIZE); return 0; } /* * We're trying to detect whether the last wdmd exited uncleanly and the * system has not been reset since. In that case we don't want to start * and open /dev/watchdog, because that will ping the wd which will extend * the pending reset, which needs to happen on schedule. * * To detect this, we want to do/set something on the system that will * not go away (be cleared) if we exit, but will go away if the system * is reset. If we were certain there was a tmpfs file system we could * use, then we could create a file there and just refuse to start if * the file exists. * * Until we are certain of tmpfs somewhere, create a shared mem object * on the system. */ static int setup_shm(void) { int rv; rv = shm_open("/wdmd", O_RDWR|O_CREAT|O_EXCL, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH); if (rv < 0) { log_error("other wdmd not cleanly stopped, shm_open error %d", errno); return rv; } shm_fd = rv; return 0; } static void close_shm(void) { shm_unlink("/wdmd"); close(shm_fd); } static int test_loop(void) { void (*workfn) (int ci); void (*deadfn) (int ci); uint64_t test_time; int poll_timeout; int sleep_seconds; int fail_count; int rv, i; pet_watchdog(); test_time = 0; poll_timeout = test_interval * 1000; while (1) { rv = poll(pollfd, client_maxi + 1, poll_timeout); if (rv == -1 && errno == EINTR) continue; if (rv < 0) { /* not sure */ } for (i = 0; i <= client_maxi; i++) { if (client[i].fd < 0) continue; if (pollfd[i].revents & POLLIN) { workfn = client[i].workfn; if (workfn) workfn(i); } if (pollfd[i].revents & (POLLERR | POLLHUP | POLLNVAL)) { deadfn = client[i].deadfn; if (deadfn) deadfn(i); } } if (daemon_quit && !active_clients()) break; if (monotime() - test_time >= test_interval) { test_time = monotime(); log_debug("test_time %llu", (unsigned long long)test_time); fail_count = 0; fail_count += test_files(); fail_count += test_scripts(); fail_count += 
test_clients(); if (!fail_count) { if (dev_fd == -1) { open_dev(); pet_watchdog(); log_error("%s reopen", watchdog_path); } else { pet_watchdog(); } test_interval = DEFAULT_TEST_INTERVAL; } else { /* If we can patch the kernel so that close does not generate a ping, then we can skip this close, and just not pet the device in this case. Also see test_client above. */ close_watchdog_unclean(); test_interval = RECOVER_TEST_INTERVAL; } } sleep_seconds = test_time + test_interval - monotime(); poll_timeout = (sleep_seconds > 0) ? sleep_seconds * 1000 : 500; log_debug("test_interval %d sleep_seconds %d poll_timeout %d", test_interval, sleep_seconds, poll_timeout); } return 0; } static int lockfile(void) { char buf[16]; struct flock lock; mode_t old_umask; int fd, rv; old_umask = umask(0022); rv = mkdir(WDMD_RUN_DIR, 0775); if (rv < 0 && errno != EEXIST) { umask(old_umask); return rv; } umask(old_umask); sprintf(lockfile_path, "%s/wdmd.pid", WDMD_RUN_DIR); fd = open(lockfile_path, O_CREAT|O_WRONLY|O_CLOEXEC, 0644); if (fd < 0) { log_error("lockfile open error %s: %s", lockfile_path, strerror(errno)); return -1; } lock.l_type = F_WRLCK; lock.l_start = 0; lock.l_whence = SEEK_SET; lock.l_len = 0; rv = fcntl(fd, F_SETLK, &lock); if (rv < 0) { log_error("lockfile setlk error %s: %s", lockfile_path, strerror(errno)); goto fail; } rv = ftruncate(fd, 0); if (rv < 0) { log_error("lockfile truncate error %s: %s", lockfile_path, strerror(errno)); goto fail; } memset(buf, 0, sizeof(buf)); snprintf(buf, sizeof(buf), "%d\n", getpid()); rv = write(fd, buf, strlen(buf)); if (rv <= 0) { log_error("lockfile write error %s: %s", lockfile_path, strerror(errno)); goto fail; } return fd; fail: close(fd); return -1; } static void setup_priority(void) { struct sched_param sched_param; int rv; if (!high_priority) return; rv = mlockall(MCL_CURRENT | MCL_FUTURE); if (rv < 0) { log_error("mlockall failed"); } rv = sched_get_priority_max(SCHED_RR); if (rv < 0) { log_error("could not get max 
scheduler priority err %d", errno); return; } sched_param.sched_priority = rv; rv = sched_setscheduler(0, SCHED_RR|SCHED_RESET_ON_FORK, &sched_param); if (rv < 0) { log_error("could not set RR|RESET_ON_FORK priority %d err %d", sched_param.sched_priority, errno); } } static int group_to_gid(char *arg) { struct group *gr; gr = getgrnam(arg); if (gr == NULL) { log_error("group '%s' not found, using socket gid: %i", arg, DEFAULT_SOCKET_GID); return DEFAULT_SOCKET_GID; } return gr->gr_gid; } static void print_debug_and_exit(void) { struct sockaddr_un addr; struct wdmd_header h; int rv, s; s = socket(AF_LOCAL, SOCK_STREAM, 0); if (s < 0) exit(1); rv = wdmd_socket_address(&addr); if (rv < 0) exit(1); rv = connect(s, (struct sockaddr *) &addr, sizeof(struct sockaddr_un)); if (rv < 0) exit(1); memset(&h, 0, sizeof(h)); h.cmd = CMD_DUMP_DEBUG; rv = send(s, (void *)&h, sizeof(struct wdmd_header), 0); if (rv < 0) exit(1); rv = recv(s, &debug_buf, DEBUG_SIZE, 0); if (rv < 0) exit(1); rv = write(STDOUT_FILENO, debug_buf, strlen(debug_buf)); exit(0); } static void print_usage_and_exit(int status) { printf("Usage:\n"); printf("wdmd [options]\n\n"); printf("--version, -V print version\n"); printf("--help, -h print usage\n"); printf("--dump, -d print debug from daemon\n"); printf("--probe, -p print path of functional watchdog device\n"); printf("-D debug: no fork and print all logging to stderr\n"); printf("-H 0|1 use high priority features (1 yes, 0 no, default %d)\n", DEFAULT_HIGH_PRIORITY); printf("-G group ownership for the socket\n"); printf("-S 0|1 allow script tests (default %d)\n", allow_scripts); printf("-s path to scripts dir (default %s)\n", scripts_dir); printf("-k kill unfinished scripts after num seconds (default %d)\n", kill_script_sec); printf("-w /dev/watchdog path to the watchdog device to try first\n"); exit(status); } static void print_version_and_exit(void) { printf("wdmd version %s\n", VERSION); exit(0); } /* If wdmd exits abnormally, /dev/watchdog will 
eventually fire, and clients can detect wdmd is gone and begin to shut down cleanly ahead of the reset. But what if wdmd is restarted before the wd fires? It will begin petting /dev/watchdog again, leaving the previous clients unprotected. I don't know if this situation is important enough to try to prevent. One way would be for wdmd to fail starting if it found a pid file left over from its previous run. */ int main(int argc, char *argv[]) { int do_probe = 0; int rv; while (1) { int c; int option_index = 0; static struct option long_options[] = { {"help", no_argument, 0, 'h' }, {"probe", no_argument, 0, 'p' }, {"dump", no_argument, 0, 'd' }, {"version", no_argument, 0, 'V' }, {0, 0, 0, 0 } }; c = getopt_long(argc, argv, "hpdVDH:G:S:s:k:w:", long_options, &option_index); if (c == -1) break; switch (c) { case 'h': print_usage_and_exit(0); break; case 'p': do_probe = 1; break; case 'd': print_debug_and_exit(); break; case 'V': print_version_and_exit(); break; case 'D': daemon_debug = 1; break; case 'G': socket_gname = strdup(optarg); break; case 'H': high_priority = atoi(optarg); break; case 'S': allow_scripts = atoi(optarg); break; case 's': scripts_dir = strdup(optarg); break; case 'k': kill_script_sec = atoi(optarg); break; case 'w': snprintf(option_path, WDPATH_SIZE, "%s", optarg); option_path[WDPATH_SIZE - 1] = '\0'; break; } } if (do_probe) { rv = setup_shm(); if (rv < 0) { fprintf(stderr, "cannot probe watchdog devices while wdmd is in use.\n"); openlog("wdmd-probe", LOG_CONS | LOG_PID, LOG_DAEMON); syslog(LOG_ERR, "cannot probe watchdog devices while wdmd is in use.\n"); exit(EXIT_FAILURE); } rv = probe_watchdog(); close_shm(); if (rv < 0) exit(EXIT_FAILURE); else exit(EXIT_SUCCESS); } socket_gid = group_to_gid(socket_gname); if (!daemon_debug) { if (daemon(0, 0) < 0) { fprintf(stderr, "cannot fork daemon\n"); exit(EXIT_FAILURE); } } openlog("wdmd", LOG_CONS | LOG_PID, LOG_DAEMON); log_error("wdmd started S%d H%d G%d", allow_scripts, high_priority, 
socket_gid); setup_priority(); rv = lockfile(); if (rv < 0) goto out; rv = setup_shm(); if (rv < 0) goto out_lockfile; rv = setup_signals(); if (rv < 0) goto out_shm; rv = setup_scripts(); if (rv < 0) goto out_lockfile; rv = setup_files(); if (rv < 0) goto out_scripts; rv = setup_clients(); if (rv < 0) goto out_files; rv = setup_watchdog(); if (rv < 0) goto out_clients; rv = test_loop(); close_watchdog(); out_clients: close_clients(); out_files: close_files(); out_scripts: close_scripts(); out_shm: close_shm(); out_lockfile: unlink(lockfile_path); out: return rv; } sanlock-3.8.2/wdmd/wdmd.8000066400000000000000000000061421371427612200151450ustar00rootroot00000000000000.TH WDMD 8 2011-08-01 .SH NAME wdmd \- watchdog multiplexing daemon .SH SYNOPSIS .B wdmd [OPTIONS] .SH DESCRIPTION This daemon opens /dev/watchdog and allows multiple independent sources to detmermine whether each KEEPALIVE is done. Every test interval (10 seconds), the daemon tests each source. If any test fails, the KEEPALIVE is not done. In a standard configuration, the watchdog timer will reset the system if no KEEPALIVE is done for 60 seconds ("fire timeout"). This means that if a single test fails 5-6 times in row, the watchdog will fire and reset the system. With multiple test sources, fewer separate failures back to back can also cause a reset, e.g. T seconds, P pass, F fail .br T00: test1 P, test2 P, test3 P: KEEPALIVE done .br T10: test1 F, test2 F, test3 P: KEEPALIVE skipped .br T20: test1 F, test2 P, test3 P: KEEPALIVE skipped .br T30: test1 P, test2 F, test3 P: KEEPALIVE skipped .br T40: test1 P, test2 P, test3 F: KEEPALIVE skipped .br T50: test1 F, test2 F, test3 P: KEEPALIVE skipped .br T60: test1 P, test2 F, test3 P: KEEPALIVE skipped .br T60: watchdog fires, system resets (Depending on timings, the system may be reset sometime shortly before T60, and the tests at T60 would not be run.) 
A crucial aspect of the design and function of wdmd is that if any single
source does not pass tests for the fire timeout, the watchdog is
guaranteed to fire, regardless of whether other sources on the system have
passed or failed.  A spurious reset due to the combined effects of
multiple failing tests, as shown above, is an accepted side effect.

The wdmd init script will load the softdog module if no other watchdog
module has been loaded.

wdmd cannot be used on the system with any other program that needs to
open /dev/watchdog, e.g. watchdog(8).

.SS Test Source: clients

Using libwdmd, programs connect to wdmd via a unix socket, and send
regular messages to wdmd to update an expiry time for their connection.
Every test interval, wdmd will check if the expiry time for a connection
has been reached.  If so, the test for that client fails.

.SS Test Source: scripts

wdmd will run scripts from a designated directory every test interval.
If a script exits with 0, the test is considered a success, otherwise a
failure.  If a script does not exit by the end of the test interval, it is
considered a failure.

.SH OPTIONS
.TP
.B \-\-version, \-V
Print version.

.TP
.B \-\-help, \-h
Print usage.

.TP
.B \-\-dump, \-d
Print debug information from the daemon.

.TP
.B \-\-probe, \-p
Print path of functional watchdog device.  Exit code 0 indicates a
functional device was found.  Exit code 1 indicates a functional device
was not found.

.TP
.B \-D
Enable debugging to stderr and don't fork.

.TP
.BI \-H " 0|1"
Enable (1) or disable (0) high priority features such as realtime
scheduling priority and mlockall.

.TP
.BI \-G " name"
Group ownership for the socket.

.TP
.BI \-S " 0|1"
Enable (1) or disable (0) script tests.

.TP
.BI \-s " path"
Path to scripts dir.

.TP
.BI \-k " num"
Kill unfinished scripts after num seconds.

.TP
.BI \-w " path"
The path to the watchdog device to try first.

sanlock-3.8.2/wdmd/wdmd.h000066400000000000000000000012471371427612200152260ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. */ #ifndef __WDMD_H__ #define __WDMD_H__ #define WDMD_NAME_SIZE 128 int wdmd_connect(void); int wdmd_register(int con, char *name); int wdmd_refcount_set(int con); int wdmd_refcount_clear(int con); int wdmd_test_live(int con, uint64_t renewal_time, uint64_t expire_time); int wdmd_status(int con, int *test_interval, int *fire_timeout, uint64_t *last_keepalive); #endif sanlock-3.8.2/wdmd/wdmd_client.c000066400000000000000000000027741371427612200165650ustar00rootroot00000000000000/* * Copyright 2011 Red Hat, Inc. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions * of the GNU General Public License v2 or (at your option) any later version. 
*/ #include #include #include #include #include #include #include #include #include #include #include "wdmd.h" int main(int argc, char *argv[]) { char name[WDMD_NAME_SIZE]; uint64_t t, last_keepalive; int test_interval, fire_timeout; int con, rv; int i = 0; int iter = 10; if (argc > 1) iter = atoi(argv[1]); memset(name, 0, sizeof(name)); sprintf(name, "%s", "wdmd_client"); con = wdmd_connect(); printf("wdmd_connect %d\n", con); if (con < 0) return con; rv = wdmd_register(con, name); printf("wdmd_register %d\n", rv); if (rv < 0) return rv; rv = wdmd_status(con, &test_interval, &fire_timeout, &last_keepalive); printf("wdmd_status %d test_interval %d fire_timeout %d last_keepalive %llu\n", rv, test_interval, fire_timeout, (unsigned long long)last_keepalive); if (rv < 0) return rv; while (1) { sleep(10); t = time(NULL); rv = wdmd_test_live(con, t, t + 40); printf("wdmd_test_live %d %llu %llu\n", rv, (unsigned long long)t, (unsigned long long)(t + 40)); if (i++ > iter) break; } rv = wdmd_test_live(con, t, 0); printf("wdmd_test_live 0 %d\n", rv); return 0; } sanlock-3.8.2/wdmd/wdmd_sock.c000066400000000000000000000013421371427612200162340ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. */ #include #include #include #include #include #include #include #include "wdmd.h" #include "wdmd_sock.h" int wdmd_socket_address(struct sockaddr_un *addr) { memset(addr, 0, sizeof(struct sockaddr_un)); addr->sun_family = AF_LOCAL; snprintf(addr->sun_path, sizeof(addr->sun_path) - 1, "%s/%s", WDMD_RUN_DIR, WDMD_SOCKET_NAME); return 0; } sanlock-3.8.2/wdmd/wdmd_sock.h000066400000000000000000000015131371427612200162410ustar00rootroot00000000000000/* * Copyright 2010-2011 Red Hat, Inc. 
* * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. */ #ifndef __WDMD_SOCK_H__ #define __WDMD_SOCK_H__ #define WDMD_RUN_DIR "/run/wdmd" #define WDMD_SOCKET_NAME "wdmd.sock" enum { CMD_REGISTER = 1, CMD_REFCOUNT_SET, CMD_REFCOUNT_CLEAR, CMD_TEST_LIVE, CMD_STATUS, CMD_DUMP_DEBUG, }; struct wdmd_header { uint32_t magic; uint32_t cmd; uint32_t len; uint32_t flags; uint32_t test_interval; uint32_t fire_timeout; uint64_t last_keepalive; uint64_t renewal_time; uint64_t expire_time; char name[WDMD_NAME_SIZE]; }; int wdmd_socket_address(struct sockaddr_un *addr); #endif