././@PaxHeader0000000000000000000000000000003200000000000010210 xustar0026 mtime=1743709206.65089 lxcfs-6.0.4/0000775000175000017500000000000014773562027013123 5ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/.github/0000775000175000017500000000000014773561567014473 5ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/.github/ISSUE_TEMPLATE.md0000664000175000017500000000166014773561567017203 0ustar00stgraberstgraberThe template below is mostly useful for bug reports and support questions. Feel free to remove anything which doesn't apply to you and add more information where it makes sense. # Required information * Distribution: * `cat /etc/os-release` or `cat /etc/lsb-release` * LXCFS version: * The output of * `uname -a` * `cat /proc/1/mounts` * `ps aux | grep lxcfs` * LXCFS logs # Issue description A brief description of what failed or what could be improved. If you have LXCFS crashing, please, collect a crash dump. # Steps to reproduce 1. Step one 2. Step two 3. Step three # Information to attach - [ ] any relevant kernel output (`dmesg`) - [ ] LXCFS daemon output / logs - [ ] LXCFS configuration (Which options were used to start a LXCFS daemon? 
`ps aux | grep lxcfs`) - [ ] in case of crash, a core dump (please, read [how to collect a core dump](https://github.com/lxc/lxcfs?tab=readme-ov-file#core-dump)) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/.github/actions/0000775000175000017500000000000014773561567016133 5ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/.github/actions/build/0000775000175000017500000000000014773561567017232 5ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/.github/actions/build/action.yml0000664000175000017500000000222314773561567021231 0ustar00stgraberstgrabername: Build LXCFS description: Install dependencies and build the codebase inputs: compiler: required: true fuse: required: true os: required: true runs: using: "composite" steps: - name: Install dependencies shell: bash run: | echo "::group::Installing dependencies" sudo apt-get update -qq sudo apt-get install -qq \ ${{ inputs.compiler }} \ lib${{ inputs.fuse }}-dev \ meson \ pkg-config \ uuid-runtime echo "::endgroup::" - name: Compiler version shell: bash env: CC: ${{ inputs.compiler }} run: | echo "::group::Compiler version" ${CC} --version echo "::endgroup::" - name: Build shell: bash env: CC: ${{ inputs.compiler }} run: | echo "::group::Building LXCFS" # Standard build meson setup build \ -Ddocs=false \ -Dtests=true \ -Dinit-script=systemd \ -Dprefix=/usr \ -Db_sanitize=address,undefined meson compile -C build echo "::endgroup::" ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/.github/actions/testsuite/0000775000175000017500000000000014773561567020164 5ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 
lxcfs-6.0.4/.github/actions/testsuite/action.yml0000664000175000017500000000126114773561567022164 0ustar00stgraberstgrabername: Test suite description: Runs the testsuite inputs: compiler: required: true fuse: required: true os: required: true runs: using: "composite" steps: - name: Build LXCFS uses: ./.github/actions/build with: compiler: ${{ inputs.compiler }} fuse: ${{ inputs.fuse }} os: ${{ inputs.os }} - name: Test shell: bash env: CC: ${{ inputs.compiler }} run: | echo "::group::Running the testsuite" echo 1 | sudo tee /sys/fs/cgroup/cpuset/cgroup.clone_children || true sudo -E PATH="${PATH}" LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" build/tests/main.sh echo "::endgroup::" ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/.github/workflows/0000775000175000017500000000000014773561567016530 5ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/.github/workflows/commits.yml0000664000175000017500000000164414773561567020733 0ustar00stgraberstgrabername: Commits on: - pull_request permissions: contents: read jobs: dco-check: permissions: pull-requests: read # for tim-actions/get-pr-commits to get list of commits from the PR name: Signed-off-by (DCO) runs-on: ubuntu-24.04 steps: - name: Get PR Commits id: 'get-pr-commits' uses: tim-actions/get-pr-commits@master with: token: ${{ secrets.GITHUB_TOKEN }} - name: Check that all commits are signed-off uses: tim-actions/dco@master with: commits: ${{ steps.get-pr-commits.outputs.commits }} target-branch: permissions: contents: none name: Branch target runs-on: ubuntu-24.04 steps: - name: Check branch target env: TARGET: ${{ github.event.pull_request.base.ref }} run: | set -x [ "${TARGET}" = "main" ] && exit 0 echo "Invalid branch target: ${TARGET}" exit 1 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 
lxcfs-6.0.4/.github/workflows/coverity.yml0000664000175000017500000000326614773561567021126 0ustar00stgraberstgrabername: Coverity on: push: branches: - main permissions: contents: read jobs: coverity: name: Build and upload runs-on: ubuntu-24.04 steps: - name: Checkout code uses: actions/checkout@v4 - name: Download Coverity Build Tool run: | wget -q https://scan.coverity.com/download/cxx/linux64 --post-data "token=$TOKEN&project=lxc/lxcfs" -O cov-analysis-linux64.tar.gz mkdir cov-analysis-linux64 tar xzf cov-analysis-linux64.tar.gz --strip 1 -C cov-analysis-linux64 env: TOKEN: ${{ secrets.COVERITY_SCAN_TOKEN }} - name: Install dependencies run: | sudo apt-get update -qq sudo apt-get install -qq gcc clang meson sudo apt-get install -qq libfuse-dev uuid-runtime python3 python3-setuptools - name: Run coverity run: | # Configure export PATH="$(pwd)/cov-analysis-linux64/bin:${PATH}" export CFLAGS="-Wall -Werror" export LDFLAGS="-pthread -lpthread" BUILD="$(pwd)/build" meson setup -Ddocs=false -Dtests=true -Dinit-script=systemd -Dprefix=/usr build/ # Build cov-build --dir cov-int ninja -C ${BUILD} tar czvf upload.tgz cov-int # Submit the results curl \ --form project=lxc/lxcfs \ --form token=${TOKEN} \ --form email=lxc-devel@lists.linuxcontainers.org \ --form file=@upload.tgz \ --form version=main \ --form description="${GITHUB_SHA}" \ https://scan.coverity.com/builds?project=lxc/lxcfs env: TOKEN: ${{ secrets.COVERITY_SCAN_TOKEN }} ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/.github/workflows/tests.yml0000664000175000017500000000537514773561567020427 0ustar00stgraberstgrabername: Tests on: - push - pull_request permissions: contents: read jobs: testsuite-hosted: name: Test suite (x86_64) strategy: fail-fast: false matrix: compiler: - gcc - clang fuse: - fuse - fuse3 os: - ubuntu-22.04 - ubuntu-24.04 runs-on: ${{ matrix.os }} steps: - name: Checkout code uses: actions/checkout@v4 - name: Run the testsuite 
uses: ./.github/actions/testsuite with: compiler: ${{ matrix.compiler }} fuse: ${{ matrix.fuse }} os: ${{ matrix.os }} testsuite-self-hosted: name: Test suite (aarch64) strategy: fail-fast: false matrix: compiler: - gcc - clang fuse: - fuse - fuse3 os: - ubuntu-22.04 - ubuntu-24.04 runs-on: - self-hosted - cpu-4 - mem-4G - disk-50G - arch-arm64 - image-${{ matrix.os }} steps: - name: Checkout code uses: actions/checkout@v4 - name: Run the testsuite uses: ./.github/actions/testsuite with: compiler: ${{ matrix.compiler }} fuse: ${{ matrix.fuse }} os: ${{ matrix.os }} upgrade-test: name: Upgrade test strategy: fail-fast: false matrix: compiler: - gcc fuse: - fuse3 os: - ubuntu-24.04 runs-on: ${{ matrix.os }} steps: - name: Checkout code uses: actions/checkout@v4 - name: Build LXCFS uses: ./.github/actions/build with: compiler: ${{ matrix.compiler }} fuse: ${{ matrix.fuse }} os: ${{ matrix.os }} - name: Build upstream head version env: CC: ${{ matrix.compiler }} run: | git clone -b stable-6.0 https://github.com/lxc/lxcfs.git ../upstream-lxcfs cd ../upstream-lxcfs meson setup -Ddocs=false -Dtests=true -Dinit-script=systemd -Dprefix=/usr -Db_sanitize=address,undefined build/ meson compile -C build - name: Test env: CC: ${{ matrix.compiler }} WORKSPACE_PATH: ${{ github.workspace }} run: | UPSTREAM_LXCFS_TREE=$(realpath ${WORKSPACE_PATH}/../upstream-lxcfs) NEW_LXCFS_TREE="${WORKSPACE_PATH}" echo "${NEW_LXCFS_TREE}" echo "${UPSTREAM_LXCFS_TREE}" cd $UPSTREAM_LXCFS_TREE [ -f build/tests/live-upgrade-test.sh ] || exit 0 echo 1 | sudo tee /sys/fs/cgroup/cpuset/cgroup.clone_children || true sudo -E PATH="${PATH}" LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" build/tests/live-upgrade-test.sh "${NEW_LXCFS_TREE}" ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/.gitignore0000664000175000017500000000043514773561567015125 0ustar00stgraberstgraberINSTALL lxcfs missing lxcfs.1 lxcfs.spec share/00-lxcfs.conf share/lxc.mount.hook 
share/lxc.reboot.hook tests/test-cpusetrange tests/test-read lxcfs_mkdir tests/test-syscalls config/init/systemd/lxcfs.service *.o tags lxcfs-*.tar.gz .libs *.lo *.la .vscode src/cgroups/.dirstamp .idea/././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/AUTHORS0000664000175000017500000000016714773561567014207 0ustar00stgraberstgraberThe list of authors and contributors can be retrieved from the git commit history and in some cases, the file headers. ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/CONTRIBUTING.md0000664000175000017500000000452614773561567015373 0ustar00stgraberstgraber# Pull requests: Changes to this project should be proposed as pull requests on Github at: https://github.com/lxc/lxcfs Proposed changes will then go through code review there and once acked, be merged in the main branch. # Developer Certificate of Origin: To improve tracking of contributions to this project we use the DCO 1.1 and use a "sign-off" procedure for all changes going into the branch. The sign-off is a simple line at the end of the explanation for the commit which certifies that you wrote it or otherwise have the right to pass it on as an open-source contribution. > Developer Certificate of Origin > Version 1.1 > > Copyright (C) 2004, 2006 The Linux Foundation and its contributors. > 660 York Street, Suite 102, > San Francisco, CA 94110 USA > > Everyone is permitted to copy and distribute verbatim copies of this > license document, but changing it is not allowed. 
> > Developer's Certificate of Origin 1.1 > > By making a contribution to this project, I certify that: > > (a) The contribution was created in whole or in part by me and I > have the right to submit it under the open source license > indicated in the file; or > > (b) The contribution is based upon previous work that, to the best > of my knowledge, is covered under an appropriate open source > license and I have the right under that license to submit that > work with modifications, whether created in whole or in part > by me, under the same open source license (unless I am > permitted to submit under a different license), as indicated > in the file; or > > (c) The contribution was provided directly to me by some other > person who certified (a), (b) or (c) and I have not modified > it. > > (d) I understand and agree that this project and the contribution > are public and that a record of the contribution (including all > personal information I submit with it, including my sign-off) is > maintained indefinitely and may be redistributed consistent with > this project or the open source license(s) involved. An example of a valid sign-off line is: Signed-off-by: Random J Developer Use your real name and a valid e-mail address. Sorry, no pseudonyms or anonymous contributions are allowed. We also require each commit be individually signed-off by their author, even when part of a larger set. 
././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/COPYING0000664000175000017500000000007614773561567014171 0ustar00stgraberstgraberLXCFS is provided under: SPDX-License-Identifier: LGPL-2.1+ ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/ChangeLog0000664000175000017500000000000014773561567014673 0ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/LICENSE.LGPL2.10000664000175000017500000006364214773561567015131 0ustar00stgraberstgraber GNU LESSER GENERAL PUBLIC LICENSE Version 2.1, February 1999 Copyright (C) 1991, 1999 Free Software Foundation, Inc. 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. [This is the first released version of the Lesser GPL. It also counts as the successor of the GNU Library Public License, version 2, hence the version number 2.1.] Preamble The licenses for most software are designed to take away your freedom to share and change it. By contrast, the GNU General Public Licenses are intended to guarantee your freedom to share and change free software--to make sure the software is free for all its users. This license, the Lesser General Public License, applies to some specially designated software packages--typically libraries--of the Free Software Foundation and other authors who decide to use it. You can use it too, but we suggest you first think carefully about whether this license or the ordinary General Public License is the better strategy to use in any particular case, based on the explanations below. When we speak of free software, we are referring to freedom of use, not price. 
Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for this service if you wish); that you receive source code or can get it if you want it; that you can change the software and use pieces of it in new free programs; and that you are informed that you can do these things. To protect your rights, we need to make restrictions that forbid distributors to deny you these rights or to ask you to surrender these rights. These restrictions translate to certain responsibilities for you if you distribute copies of the library or if you modify it. For example, if you distribute copies of the library, whether gratis or for a fee, you must give the recipients all the rights that we gave you. You must make sure that they, too, receive or can get the source code. If you link other code with the library, you must provide complete object files to the recipients, so that they can relink them with the library after making changes to the library and recompiling it. And you must show them these terms so they know their rights. We protect your rights with a two-step method: (1) we copyright the library, and (2) we offer you this license, which gives you legal permission to copy, distribute and/or modify the library. To protect each distributor, we want to make it very clear that there is no warranty for the free library. Also, if the library is modified by someone else and passed on, the recipients should know that what they have is not the original version, so that the original author's reputation will not be affected by problems that might be introduced by others. Finally, software patents pose a constant threat to the existence of any free program. We wish to make sure that a company cannot effectively restrict the users of a free program by obtaining a restrictive license from a patent holder. 
Therefore, we insist that any patent license obtained for a version of the library must be consistent with the full freedom of use specified in this license. Most GNU software, including some libraries, is covered by the ordinary GNU General Public License. This license, the GNU Lesser General Public License, applies to certain designated libraries, and is quite different from the ordinary General Public License. We use this license for certain libraries in order to permit linking those libraries into non-free programs. When a program is linked with a library, whether statically or using a shared library, the combination of the two is legally speaking a combined work, a derivative of the original library. The ordinary General Public License therefore permits such linking only if the entire combination fits its criteria of freedom. The Lesser General Public License permits more lax criteria for linking other code with the library. We call this license the "Lesser" General Public License because it does Less to protect the user's freedom than the ordinary General Public License. It also provides other free software developers Less of an advantage over competing non-free programs. These disadvantages are the reason we use the ordinary General Public License for many libraries. However, the Lesser license provides advantages in certain special circumstances. For example, on rare occasions, there may be a special need to encourage the widest possible use of a certain library, so that it becomes a de-facto standard. To achieve this, non-free programs must be allowed to use the library. A more frequent case is that a free library does the same job as widely used non-free libraries. In this case, there is little to gain by limiting the free library to free software only, so we use the Lesser General Public License. In other cases, permission to use a particular library in non-free programs enables a greater number of people to use a large body of free software. 
For example, permission to use the GNU C Library in non-free programs enables many more people to use the whole GNU operating system, as well as its variant, the GNU/Linux operating system. Although the Lesser General Public License is Less protective of the users' freedom, it does ensure that the user of a program that is linked with the Library has the freedom and the wherewithal to run that program using a modified version of the Library. The precise terms and conditions for copying, distribution and modification follow. Pay close attention to the difference between a "work based on the library" and a "work that uses the library". The former contains code derived from the library, whereas the latter must be combined with the library in order to run. GNU LESSER GENERAL PUBLIC LICENSE TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 0. This License Agreement applies to any software library or other program which contains a notice placed by the copyright holder or other authorized party saying it may be distributed under the terms of this Lesser General Public License (also called "this License"). Each licensee is addressed as "you". A "library" means a collection of software functions and/or data prepared so as to be conveniently linked with application programs (which use some of those functions and data) to form executables. The "Library", below, refers to any such software library or work which has been distributed under these terms. A "work based on the Library" means either the Library or any derivative work under copyright law: that is to say, a work containing the Library or a portion of it, either verbatim or with modifications and/or translated straightforwardly into another language. (Hereinafter, translation is included without limitation in the term "modification".) "Source code" for a work means the preferred form of the work for making modifications to it. 
For a library, complete source code means all the source code for all modules it contains, plus any associated interface definition files, plus the scripts used to control compilation and installation of the library. Activities other than copying, distribution and modification are not covered by this License; they are outside its scope. The act of running a program using the Library is not restricted, and output from such a program is covered only if its contents constitute a work based on the Library (independent of the use of the Library in a tool for writing it). Whether that is true depends on what the Library does and what the program that uses the Library does. 1. You may copy and distribute verbatim copies of the Library's complete source code as you receive it, in any medium, provided that you conspicuously and appropriately publish on each copy an appropriate copyright notice and disclaimer of warranty; keep intact all the notices that refer to this License and to the absence of any warranty; and distribute a copy of this License along with the Library. You may charge a fee for the physical act of transferring a copy, and you may at your option offer warranty protection in exchange for a fee. 2. You may modify your copy or copies of the Library or any portion of it, thus forming a work based on the Library, and copy and distribute such modifications or work under the terms of Section 1 above, provided that you also meet all of these conditions: a) The modified work must itself be a software library. b) You must cause the files modified to carry prominent notices stating that you changed the files and the date of any change. c) You must cause the whole of the work to be licensed at no charge to all third parties under the terms of this License. 
d) If a facility in the modified Library refers to a function or a table of data to be supplied by an application program that uses the facility, other than as an argument passed when the facility is invoked, then you must make a good faith effort to ensure that, in the event an application does not supply such function or table, the facility still operates, and performs whatever part of its purpose remains meaningful. (For example, a function in a library to compute square roots has a purpose that is entirely well-defined independent of the application. Therefore, Subsection 2d requires that any application-supplied function or table used by this function must be optional: if the application does not supply it, the square root function must still compute square roots.) These requirements apply to the modified work as a whole. If identifiable sections of that work are not derived from the Library, and can be reasonably considered independent and separate works in themselves, then this License, and its terms, do not apply to those sections when you distribute them as separate works. But when you distribute the same sections as part of a whole which is a work based on the Library, the distribution of the whole must be on the terms of this License, whose permissions for other licensees extend to the entire whole, and thus to each and every part regardless of who wrote it. Thus, it is not the intent of this section to claim rights or contest your rights to work written entirely by you; rather, the intent is to exercise the right to control the distribution of derivative or collective works based on the Library. In addition, mere aggregation of another work not based on the Library with the Library (or with a work based on the Library) on a volume of a storage or distribution medium does not bring the other work under the scope of this License. 3. 
You may opt to apply the terms of the ordinary GNU General Public License instead of this License to a given copy of the Library. To do this, you must alter all the notices that refer to this License, so that they refer to the ordinary GNU General Public License, version 2, instead of to this License. (If a newer version than version 2 of the ordinary GNU General Public License has appeared, then you can specify that version instead if you wish.) Do not make any other change in these notices. Once this change is made in a given copy, it is irreversible for that copy, so the ordinary GNU General Public License applies to all subsequent copies and derivative works made from that copy. This option is useful when you wish to copy part of the code of the Library into a program that is not a library. 4. You may copy and distribute the Library (or a portion or derivative of it, under Section 2) in object code or executable form under the terms of Sections 1 and 2 above provided that you accompany it with the complete corresponding machine-readable source code, which must be distributed under the terms of Sections 1 and 2 above on a medium customarily used for software interchange. If distribution of object code is made by offering access to copy from a designated place, then offering equivalent access to copy the source code from the same place satisfies the requirement to distribute the source code, even though third parties are not compelled to copy the source along with the object code. 5. A program that contains no derivative of any portion of the Library, but is designed to work with the Library by being compiled or linked with it, is called a "work that uses the Library". Such a work, in isolation, is not a derivative work of the Library, and therefore falls outside the scope of this License. 
However, linking a "work that uses the Library" with the Library creates an executable that is a derivative of the Library (because it contains portions of the Library), rather than a "work that uses the library". The executable is therefore covered by this License. Section 6 states terms for distribution of such executables. When a "work that uses the Library" uses material from a header file that is part of the Library, the object code for the work may be a derivative work of the Library even though the source code is not. Whether this is true is especially significant if the work can be linked without the Library, or if the work is itself a library. The threshold for this to be true is not precisely defined by law. If such an object file uses only numerical parameters, data structure layouts and accessors, and small macros and small inline functions (ten lines or less in length), then the use of the object file is unrestricted, regardless of whether it is legally a derivative work. (Executables containing this object code plus portions of the Library will still fall under Section 6.) Otherwise, if the work is a derivative of the Library, you may distribute the object code for the work under the terms of Section 6. Any executables containing that work also fall under Section 6, whether or not they are linked directly with the Library itself. 6. As an exception to the Sections above, you may also combine or link a "work that uses the Library" with the Library to produce a work containing portions of the Library, and distribute that work under terms of your choice, provided that the terms permit modification of the work for the customer's own use and reverse engineering for debugging such modifications. You must give prominent notice with each copy of the work that the Library is used in it and that the Library and its use are covered by this License. You must supply a copy of this License. 
If the work during execution displays copyright notices, you must include the copyright notice for the Library among them, as well as a reference directing the user to the copy of this License. Also, you must do one of these things: a) Accompany the work with the complete corresponding machine-readable source code for the Library including whatever changes were used in the work (which must be distributed under Sections 1 and 2 above); and, if the work is an executable linked with the Library, with the complete machine-readable "work that uses the Library", as object code and/or source code, so that the user can modify the Library and then relink to produce a modified executable containing the modified Library. (It is understood that the user who changes the contents of definitions files in the Library will not necessarily be able to recompile the application to use the modified definitions.) b) Use a suitable shared library mechanism for linking with the Library. A suitable mechanism is one that (1) uses at run time a copy of the library already present on the user's computer system, rather than copying library functions into the executable, and (2) will operate properly with a modified version of the library, if the user installs one, as long as the modified version is interface-compatible with the version that the work was made with. c) Accompany the work with a written offer, valid for at least three years, to give the same user the materials specified in Subsection 6a, above, for a charge no more than the cost of performing this distribution. d) If distribution of the work is made by offering access to copy from a designated place, offer equivalent access to copy the above specified materials from the same place. e) Verify that the user has already received a copy of these materials or that you have already sent this user a copy. 
For an executable, the required form of the "work that uses the Library" must include any data and utility programs needed for reproducing the executable from it. However, as a special exception, the materials to be distributed need not include anything that is normally distributed (in either source or binary form) with the major components (compiler, kernel, and so on) of the operating system on which the executable runs, unless that component itself accompanies the executable. It may happen that this requirement contradicts the license restrictions of other proprietary libraries that do not normally accompany the operating system. Such a contradiction means you cannot use both them and the Library together in an executable that you distribute. 7. You may place library facilities that are a work based on the Library side-by-side in a single library together with other library facilities not covered by this License, and distribute such a combined library, provided that the separate distribution of the work based on the Library and of the other library facilities is otherwise permitted, and provided that you do these two things: a) Accompany the combined library with a copy of the same work based on the Library, uncombined with any other library facilities. This must be distributed under the terms of the Sections above. b) Give prominent notice with the combined library of the fact that part of it is a work based on the Library, and explaining where to find the accompanying uncombined form of the same work. 8. You may not copy, modify, sublicense, link with, or distribute the Library except as expressly provided under this License. Any attempt otherwise to copy, modify, sublicense, link with, or distribute the Library is void, and will automatically terminate your rights under this License. However, parties who have received copies, or rights, from you under this License will not have their licenses terminated so long as such parties remain in full compliance. 9. 
You are not required to accept this License, since you have not signed it. However, nothing else grants you permission to modify or distribute the Library or its derivative works. These actions are prohibited by law if you do not accept this License. Therefore, by modifying or distributing the Library (or any work based on the Library), you indicate your acceptance of this License to do so, and all its terms and conditions for copying, distributing or modifying the Library or works based on it. 10. Each time you redistribute the Library (or any work based on the Library), the recipient automatically receives a license from the original licensor to copy, distribute, link with or modify the Library subject to these terms and conditions. You may not impose any further restrictions on the recipients' exercise of the rights granted herein. You are not responsible for enforcing compliance by third parties with this License. 11. If, as a consequence of a court judgment or allegation of patent infringement or for any other reason (not limited to patent issues), conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not excuse you from the conditions of this License. If you cannot distribute so as to satisfy simultaneously your obligations under this License and any other pertinent obligations, then as a consequence you may not distribute the Library at all. For example, if a patent license would not permit royalty-free redistribution of the Library by all those who receive copies directly or indirectly through you, then the only way you could satisfy both it and this License would be to refrain entirely from distribution of the Library. If any portion of this section is held invalid or unenforceable under any particular circumstance, the balance of the section is intended to apply, and the section as a whole is intended to apply in other circumstances. 
It is not the purpose of this section to induce you to infringe any patents or other property right claims or to contest validity of any such claims; this section has the sole purpose of protecting the integrity of the free software distribution system which is implemented by public license practices. Many people have made generous contributions to the wide range of software distributed through that system in reliance on consistent application of that system; it is up to the author/donor to decide if he or she is willing to distribute software through any other system and a licensee cannot impose that choice. This section is intended to make thoroughly clear what is believed to be a consequence of the rest of this License. 12. If the distribution and/or use of the Library is restricted in certain countries either by patents or by copyrighted interfaces, the original copyright holder who places the Library under this License may add an explicit geographical distribution limitation excluding those countries, so that distribution is permitted only in or among countries not thus excluded. In such case, this License incorporates the limitation as if written in the body of this License. 13. The Free Software Foundation may publish revised and/or new versions of the Lesser General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. Each version is given a distinguishing version number. If the Library specifies a version number of this License which applies to it and "any later version", you have the option of following the terms and conditions either of that version or of any later version published by the Free Software Foundation. If the Library does not specify a license version number, you may choose any version ever published by the Free Software Foundation. 14. 
If you wish to incorporate parts of the Library into other free programs whose distribution conditions are incompatible with these, write to the author to ask for permission. For software which is copyrighted by the Free Software Foundation, write to the Free Software Foundation; we sometimes make exceptions for this. Our decision will be guided by the two goals of preserving the free status of all derivatives of our free software and of promoting the sharing and reuse of software generally. NO WARRANTY 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. END OF TERMS AND CONDITIONS How to Apply These Terms to Your New Libraries If you develop a new library, and you want it to be of the greatest possible use to the public, we recommend making it free software that everyone can redistribute and change. 
You can do so by permitting redistribution under these terms (or, alternatively, under the terms of the ordinary General Public License). To apply these terms, attach the following notices to the library. It is safest to attach them to the start of each source file to most effectively convey the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA Also add information on how to contact you by electronic and paper mail. You should also get your employer (if you work as a programmer) or your school, if any, to sign a "copyright disclaimer" for the library, if necessary. Here is a sample; alter the names: Yoyodyne, Inc., hereby disclaims all copyright interest in the library `Frob' (a library for tweaking knobs) written by James Random Hacker. , 1 April 1990 Ty Coon, President of Vice That's all there is to it! 
././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/Makefile0000664000175000017500000000046414773561567014577 0ustar00stgraberstgraber# SPDX-License-Identifier: LGPL-2.1-or-later .PHONY: all all: meson ninja -C build .PHONY: meson meson: [ -d build ] || meson setup build/ .PHONY: dist dist: meson meson dist -C build/ --formats=gztar cp build/meson-dist/*.tar.gz . .PHONY: install install: DESTDIR=$(DESTDIR) ninja -C build install ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/NEWS0000664000175000017500000000000014773561567013620 0ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/README0000664000175000017500000000000014773561567014001 0ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/README.md0000664000175000017500000001726414773561567014424 0ustar00stgraberstgraber# lxcfs ## Introduction LXCFS is a small FUSE filesystem written with the intention of making Linux containers feel more like a virtual machine. It started as a side-project of `LXC` but is useable by any runtime. LXCFS will take care that the information provided by crucial files in `procfs` such as: ``` /proc/cpuinfo /proc/diskstats /proc/meminfo /proc/stat /proc/swaps /proc/uptime /proc/slabinfo /sys/devices/system/cpu/online ``` are container aware such that the values displayed (e.g. in `/proc/uptime`) really reflect how long the container is running and not how long the host is running. Prior to the implementation of cgroup namespaces by Serge Hallyn `LXCFS` also provided a container aware `cgroupfs` tree. It took care that the container only had access to cgroups underneath it's own cgroups and thus provided additional safety. 
For systems without support for cgroup namespaces `LXCFS` will still provide this feature but it is mostly considered deprecated. ## Upgrading `LXCFS` without restart `LXCFS` is split into a shared library (a libtool module, to be precise) `liblxcfs` and a simple binary `lxcfs`. When upgrading to a newer version of `LXCFS` the `lxcfs` binary will not be restarted. Instead it will detect that a new version of the shared library is available and will reload it using `dlclose(3)` and `dlopen(3)`. This design was chosen so that the fuse main loop that `LXCFS` uses will not need to be restarted. If it were, then all containers using `LXCFS` would need to be restarted since they would otherwise be left with broken fuse mounts. To force a reload of the shared library at the next possible opportunity, simply send `SIGUSR1` to the pid of the running `LXCFS` process. This can be as simple as doing: rm /usr/lib64/lxcfs/liblxcfs.so # MUST delete the old library file first cp liblxcfs.so /usr/lib64/lxcfs/liblxcfs.so # place the new library file kill -s USR1 $(pidof lxcfs) # reload ### musl To achieve smooth upgrades through shared library reloads `LXCFS` also relies on the fact that when `dlclose(3)` drops the last reference to the shared library destructors are run and when `dlopen(3)` is called constructors are run. While this is true for `glibc` it is not true for `musl` (see the section [Unloading libraries](https://wiki.musl-libc.org/functional-differences-from-glibc.html)). So users of `LXCFS` on `musl` are advised to restart `LXCFS` completely and all containers making use of it. ## Building In order to build LXCFS install fuse and the fuse development headers according to your distro.
LXCFS prefers `fuse3` but does work with new enough `fuse2` versions: git clone git://github.com/lxc/lxcfs cd lxcfs meson setup -Dinit-script=systemd --prefix=/usr build/ meson compile -C build/ sudo meson install -C build/ To build with sanitizers you have to specify the `-Db_sanitize=...` option to `meson setup`. For example, to enable ASAN and UBSAN: meson setup -Dinit-script=systemd --prefix=/usr build/ -Db_sanitize=address,undefined meson compile -C build/ ## Usage The recommended command to run lxcfs is: sudo mkdir -p /var/lib/lxcfs sudo lxcfs /var/lib/lxcfs A container runtime wishing to use `LXCFS` should then bind mount the appropriate files into the correct places on container startup. ### LXC In order to use lxcfs with systemd-based containers, you can either use LXC 1.1 in which case it should work automatically, or otherwise, copy the `lxc.mount.hook` and `lxc.reboot.hook` files (once built) from this tree to `/usr/share/lxcfs`, make sure they are executable, then add the following lines to your container configuration: ``` lxc.mount.auto = cgroup:mixed lxc.autodev = 1 lxc.kmsg = 0 lxc.include = /usr/share/lxc/config/common.conf.d/00-lxcfs.conf ``` ### Using with Docker ``` docker run -it -m 256m --memory-swap 256m \ -v /var/lib/lxcfs/proc/cpuinfo:/proc/cpuinfo:rw \ -v /var/lib/lxcfs/proc/diskstats:/proc/diskstats:rw \ -v /var/lib/lxcfs/proc/meminfo:/proc/meminfo:rw \ -v /var/lib/lxcfs/proc/stat:/proc/stat:rw \ -v /var/lib/lxcfs/proc/swaps:/proc/swaps:rw \ -v /var/lib/lxcfs/proc/uptime:/proc/uptime:rw \ -v /var/lib/lxcfs/proc/slabinfo:/proc/slabinfo:rw \ -v /var/lib/lxcfs/sys/devices/system/cpu:/sys/devices/system/cpu:rw \ ubuntu:18.04 /bin/bash ``` In a system with swap enabled, the parameter "-u" can be used to set all values in "meminfo" that refer to the swap to 0.
sudo lxcfs -u /var/lib/lxcfs ## Swap handling If you noticed LXCFS not showing any SWAP in your container despite having SWAP on your system, please read this section carefully and look for instructions on how to enable SWAP accounting for your distribution. Swap cgroup handling on Linux is very confusing and there just isn't a perfect way for LXCFS to handle it. Terminology used below: - RAM refers to `memory.usage_in_bytes` and `memory.limit_in_bytes` - RAM+SWAP refers to `memory.memsw.usage_in_bytes` and `memory.memsw.limit_in_bytes` The main issues are: - SWAP accounting is often opt-in, requiring a special kernel boot time option (`swapaccount=1`) and/or special kernel build options (`CONFIG_MEMCG_SWAP`). - Both a RAM limit and a RAM+SWAP limit can be set. The delta however isn't the available SWAP space as the kernel is still free to SWAP as much of the RAM as it feels like. This makes it impossible to render a SWAP device size as using the delta between RAM and RAM+SWAP for that wouldn't account for the kernel swapping more pages, leading to swap usage exceeding swap total. - It's impossible to disable SWAP in a given container. The closest that can be done is setting swappiness down to 0 which severely limits the risk of swapping pages but doesn't eliminate it. As a result, LXCFS had to make some compromises, which are as follows: - When SWAP accounting isn't enabled, no SWAP space is reported at all. This is simply because there is no way to know the SWAP consumption. The container may very much be using some SWAP though, there's just no way to know how much of it and showing a SWAP device would require some kind of SWAP usage to be reported. Showing the host value would be completely wrong, showing a 0 value would be equally wrong. - Because SWAP usage for a given container can exceed the delta between RAM and RAM+SWAP, the SWAP size is always reported to be the smaller of the RAM+SWAP limit or the host SWAP device itself.
This ensures that at no point SWAP usage will be allowed to exceed the SWAP size. - If the swappiness is set to 0 and there is no SWAP usage, no SWAP is reported. However, if there is SWAP usage, then a SWAP device of the size of the usage (100% full) is reported. This provides adequate reporting of the memory consumption while preventing applications from assuming more SWAP is available. ## Issue reporting ### Core dump In case of an LXCFS crash, it can be extremely useful for us to have a core dump of the LXCFS process memory. 1. Please check `/var/crash` and `coredumpctl list` in case you already have an LXCFS core dump file. 2. If not, you can collect one from your system as follows: On the machine where you run LXCFS, execute as root: ``` # save an old core_pattern setting value: cat /proc/sys/kernel/core_pattern > /root/core_pattern.old_value.bak # set a new one to collect all core dumps: echo '|/bin/sh -c $@ -- eval exec gzip --fast > /var/crash/core-%e.%p.gz' > /proc/sys/kernel/core_pattern # wait for the next LXCFS crash and check ls -lah /var/crash # there should be a file with a name like "core-lxcfs.80581.gz". Please, upload it somewhere and share with us.
# restore the old "core_pattern" value: cat /root/core_pattern.old_value.bak > /proc/sys/kernel/core_pattern ``` ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/config/0000775000175000017500000000000014773561567014400 5ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/config/init/0000775000175000017500000000000014773561567015343 5ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/config/init/meson.build0000664000175000017500000000161614773561567017511 0ustar00stgraberstgraber# SPDX-License-Identifier: LGPL-2.1-or-later if 'systemd' in init_script systemd = dependency('systemd') systemd_system_unit_dir = systemd.get_pkgconfig_variable('systemdsystemunitdir') systemd_service = custom_target( 'lxcfs.service', input: 'systemd/lxcfs.service.in', output: 'lxcfs.service', command: [ meson_render_jinja2, config_h, '@INPUT@', '@OUTPUT@', ], install: true, install_dir: systemd_system_unit_dir) endif if 'upstart' in init_script install_data('upstart/lxcfs.conf', install_dir: join_paths(sysconfdir, 'init')) endif if 'openrc' in init_script install_data('sysvinit/lxcfs', install_dir: join_paths(sysconfdir, 'rc.d/init.d')) endif if 'sysvinit' in init_script install_data('sysvinit/lxcfs', install_dir: join_paths(sysconfdir, 'init.d')) endif ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/config/init/systemd/0000775000175000017500000000000014773561567017033 5ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/config/init/systemd/lxcfs.service.in0000664000175000017500000000064514773561567022146 0ustar00stgraberstgraber[Unit] Description=FUSE filesystem for LXC ConditionVirtualization=!container Before=lxc.service 
Documentation=man:lxcfs(1) [Service] OOMScoreAdjust=-1000 ExecStartPre=/bin/mkdir -p {{LXCFSTARGETDIR}} ExecStart=/usr/bin/lxcfs {{LXCFSTARGETDIR}} KillMode=process Restart=on-failure ExecStopPost=-/bin/fusermount -u {{LXCFSTARGETDIR}} Delegate=yes ExecReload=/bin/kill -USR1 $MAINPID [Install] WantedBy=multi-user.target ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/config/init/sysvinit/0000775000175000017500000000000014773561567017233 5ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/config/init/sysvinit/lxcfs0000775000175000017500000000504114773561567020300 0ustar00stgraberstgraber#! /bin/sh ### BEGIN INIT INFO # Short-Description: FUSE filesystem for LXC # Description: FUSE filesystem for LXC # Provides: lxcfs # Required-Start: $remote_fs # Required-Stop: $remote_fs # Should-Start: cgroupfs-mount # Should-Stop: cgroupfs-mount # Default-Start: 2 3 4 5 # Default-Stop: 0 1 6 ### END INIT INFO DAEMON=/usr/bin/lxcfs NAME=lxcfs DESC="FUSE filesystem for LXC" PIDFILE=/var/run/lxcfs.pid OOM_SCORE_ADJ="-1000" . /lib/lsb/init-functions test -f ${DAEMON} || exit 0 set -e START="-m --start --quiet --pidfile ${PIDFILE} --name ${NAME} --startas $DAEMON --background" case "$1" in start) if init_is_upstart; then exit 1 fi # Don't start if bind-mounted from host [ ! -d /var/lib/lxcfs/proc ] || exit 0 # Cleanup in case of crash fusermount -u /var/lib/lxcfs 2> /dev/null || true [ -L /etc/mtab ] || \ sed -i "/^lxcfs \/var\/lib\/lxcfs fuse.lxcfs/d" /etc/mtab echo -n "Starting $DESC: " if start-stop-daemon ${START} -- /var/lib/lxcfs >/dev/null 2>&1 ; then echo "${NAME}." echo ${OOM_SCORE_ADJ} > /proc/`cat ${PIDFILE}`/oom_score_adj else if start-stop-daemon --test ${START} >/dev/null 2>&1; then echo "(failed)." exit 1 else echo "${DAEMON} already running." 
exit 0 fi fi exit 0 ;; stop) if init_is_upstart; then exit 0 fi echo -n "Stopping $DESC: " if start-stop-daemon --stop --quiet --pidfile ${PIDFILE} \ --startas ${DAEMON} --retry 10 --name ${NAME} \ >/dev/null 2>&1 ; then echo "${NAME}." else if start-stop-daemon --test ${START} >/dev/null 2>&1; then echo "(not running)." exit 0 else echo "(failed)." exit 1 fi fi exit 0 ;; status) if init_is_upstart; then exit 0 fi status_of_proc -p ${PIDFILE} "${DAEMON}" lxcfs ;; reload) if init_is_upstart; then exit 1 fi kill -USR1 $(cat ${PIDFILE}) ;; restart|force-reload) if init_is_upstart; then exit 1 fi $0 stop exec $0 start ;; *) echo "Usage: $0 {start|stop|restart|force-reload}" 1>&2 exit 1 ;; esac ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/config/init/upstart/0000775000175000017500000000000014773561567017045 5ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/config/init/upstart/lxcfs.conf0000664000175000017500000000105414773561567021033 0ustar00stgraberstgraberdescription "FUSE filesystem for LXC" author "Stéphane Graber " start on starting lxc or starting lxd or runlevel [2345] stop on runlevel [06] oom score -1000 respawn pre-start script [ ! 
-e /run/container_type ] || { stop; exit 0; } end script exec /usr/bin/lxcfs /var/lib/lxcfs post-stop script [ -e /run/container_type ] && exit # Cleanup in case of crash fusermount -u /var/lib/lxcfs 2> /dev/null || true [ -L /etc/mtab ] || \ sed -i "/^lxcfs \/var\/lib\/lxcfs fuse.lxcfs/d" /etc/mtab end script ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/doc/0000775000175000017500000000000014773561567013700 5ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/doc/lxcfs.man.add0000664000175000017500000000030014773561567016234 0ustar00stgraberstgraber[DESCRIPTION] LXCFS is a simple fuse- and cgroup-based filesystem virtualizing various aspects of the system to extend the useability of containers. [SEE ALSO] lxc(1), lxc.container.conf(5) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/lxcfs.spec.in0000664000175000017500000000375614773561567015546 0ustar00stgraberstgraber# Set with_systemd on distros that use it, so we can install the service # file, otherwise the sysvinit script will be installed %if 0%{?fedora} >= 14 || 0%{?rhel} >= 7 || 0%{?suse_version} >= 1210 %global with_systemd 1 %define init_script systemd # # BuildRequires systemd-units on fedora and rhel %if 0%{?fedora} >= 14 || 0%{?rhel} >= 7 BuildRequires: systemd-units %endif # # BuildRequires systemd on openSUSE and SUSE %if 0%{?suse_version} >= 1210 BuildRequires: systemd %endif %else %global with_systemd 0 %define init_script sysvinit %endif Summary: Linux Containers File System Name: {{PROJECT}} Version: {{PROJECT_VERSION}} Release: 1%{?dist} URL: https://linuxcontainers.org/lxcfs/downloads/ Source0: %{name}-%{version}.tar.gz License: LGPL 2.1+ Group: System Environment/Libraries BuildRoot: %{_tmppath}/%{name}-root BuildRequires: gcc BuildRequires: libtool BuildRequires: docbook2X BuildRequires: doxygen 
BuildRequires: fuse-devel Requires: fuse-libs %description LXCFS is a simple userspace filesystem designed to work around some current limitations of the Linux kernel. %prep %setup -q %build %configure \ --with-init-script=%{init_script} %{make_build} #Modify mount hook command if running on RHEL 7 to skip cgroup mounts for stability reasons. %if 0%{?rhel} == 7 sed -i 's/\/lxc.mount.hook/\/lxc.mount.hook --skip-cgroup-mounts/g' share/00-lxcfs.conf %endif %install [ %{buildroot} != "/" ] && rm -rf %{buildroot} make install DESTDIR=%{buildroot} mkdir -p %{buildroot}/%{_sharedstatedir}/%{name} %clean [ %{buildroot} != "/" ] && rm -rf %{buildroot} %files %defattr(-,root,root,-) %dir %{_sharedstatedir}/%{name} %if %{with_systemd} /lib/systemd/system/%{name}.service %endif %{_bindir}/%{name} %config(noreplace) %{_datarootdir}/lxc/config/common.conf.d/00-%{name}.conf %{_datarootdir}/%{name}/lxc.mount.hook %{_datarootdir}/%{name}/lxc.reboot.hook %{_libdir}/%{name}/liblxcfs.la %{_libdir}/%{name}/liblxcfs.so %changelog * Wed Jan 30 2019 Tom Parrott - 3.1.0 - Initial RPM release. ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/meson.build0000664000175000017500000002103114773561567015272 0ustar00stgraberstgraber# SPDX-License-Identifier: LGPL-2.1-or-later # Project. project( 'lxcfs', 'c', version: '6.0.4', license: 'LGPLv2+', default_options: [ 'b_colorout=always', 'b_asneeded=true', 'b_pie=true', 'c_std=gnu11', 'warning_level=2', ], meson_version: '>= 0.50') cc = meson.get_compiler('c') # Templater. if run_command( 'python3', '-c', 'import jinja2', check: false).returncode() != 0 error('python3 jinja2 missing') endif meson_build_sh = find_program('tools/meson-build.sh') meson_render_jinja2 = find_program('tools/meson-render-jinja2.py') # Configuration options. 
conf = configuration_data() conf.set_quoted('PROJECT', meson.project_name()) conf.set_quoted('PROJECT_URL', 'https://linuxcontainers.org/lxcfs/') conf.set_quoted('PROJECT_VERSION', meson.project_version()) conf.set_quoted('PACKAGE_VERSION', meson.project_version()) conf.set('_GNU_SOURCE', true) conf.set('_FILE_OFFSET_BITS', 64) conf.set('__STDC_FORMAT_MACROS', true) project_source_root = meson.current_source_dir() project_build_root = meson.current_build_dir() # Path handling. prefixdir = get_option('prefix') bindir = join_paths(prefixdir, get_option('bindir')) libdir = join_paths(prefixdir, get_option('libdir')) lxcfsdir = join_paths(libdir, 'lxcfs') sysconfdir = join_paths(prefixdir, get_option('sysconfdir')) runtimepath = join_paths(prefixdir, get_option('runtime-path')) localstatedir = join_paths('/', get_option('localstatedir')) datadir = join_paths(prefixdir, get_option('datadir')) lxcfssharedir = join_paths(datadir, 'lxcfs') lxcconfdir = join_paths(datadir, 'lxc/config/common.conf.d') lxcmandir = join_paths(datadir, 'man') conf.set_quoted('BINDIR', bindir) conf.set_quoted('LIBDIR', libdir) conf.set_quoted('LOCALSTATEDIR', localstatedir) conf.set_quoted('DEFAULT_RUNTIME_PATH', runtimepath) conf.set_quoted('SYSCONFDIR', sysconfdir) conf.set_quoted('LXCCONFDIR', lxcconfdir) conf.set_quoted('LXCFS_BUILD_ROOT', project_build_root) conf.set_quoted('LXCFSSHAREDIR', lxcfssharedir) conf.set_quoted('LXCFS_SOURCE_ROOT', project_source_root) conf.set_quoted('LXCFSTARGETDIR', join_paths(localstatedir, 'lib/lxcfs')) # Custom configuration. init_script = get_option('init-script') want_tests = get_option('tests') want_docs = get_option('docs') # Build flags. 
possible_cc_flags = [ '-Wvla', '-Wimplicit-fallthrough=5', '-Wcast-align', '-Wstrict-prototypes', '-fno-strict-aliasing', '-fstack-clash-protection', '-fstack-protector-strong', '--param=ssp-buffer-size=4', '--mcet -fcf-protection', '-Werror=implicit-function-declaration', '-Wlogical-op', '-Wmissing-include-dirs', '-Wold-style-definition', '-Winit-self', '-Wunused-but-set-variable', '-Wno-unused-parameter', '-Wfloat-equal', '-Wsuggest-attribute=noreturn', '-Werror=return-type', '-Werror=incompatible-pointer-types', '-Wformat=2', '-Wshadow', '-Wendif-labels', '-Werror=overflow', '-fdiagnostics-show-option', '-Werror=shift-count-overflow', '-Werror=shift-overflow=2', '-Wdate-time', '-Wnested-externs', '-fasynchronous-unwind-tables', '-fexceptions', '-Warray-bounds', '-Wrestrict', '-Wreturn-local-addr', '-fsanitize=cfi', '-Wstringop-overflow', ] possible_link_flags = [ '-Wl,--gc-sections', '-Wl,-z,relro', '-Wl,-z,now', '-Wl,-fuse-ld=gold', ] if meson.version().version_compare('>=0.46') add_project_link_arguments(cc.get_supported_link_arguments(possible_link_flags), language: 'c') else add_project_link_arguments(possible_link_flags, language: 'c') endif add_project_arguments(cc.get_supported_arguments(possible_cc_flags), language: 'c') # Feature detection. 
foreach ident: [ ['strlcpy', '''#include '''], ['strlcat', '''#include '''], ['pidfd_send_signal', '''#include #include #include #include '''], ['pidfd_open', '''#include #include #include #include '''], ] have = cc.has_function(ident[0], prefix: ident[1], args: '-D_GNU_SOURCE') conf.set10('HAVE_' + ident[0].to_upper(), have) endforeach fuse_version = get_option('fuse-version') if fuse_version == '3' or fuse_version == 'auto' libfuse = dependency('fuse3', required: false) if libfuse.found() conf.set10('HAVE_FUSE3', true) conf.set('FUSE_USE_VERSION', 35) if libfuse.version().version_compare('>=3.10.3') conf.set10('HAVE_FUSE_RETURNS_DT_TYPE', true) else conf.set10('HAVE_FUSE_RETURNS_DT_TYPE', false) endif endif endif if fuse_version == '2' or (not libfuse.found() and fuse_version == 'auto') libfuse = dependency('fuse', version: '>= 2.6') if libfuse.found() conf.set10('HAVE_FUSE', true) conf.set('FUSE_USE_VERSION', 26) conf.set10('HAVE_FUSE_RETURNS_DT_TYPE', true) endif endif if not libfuse.found() error('no usable fuse version found') endif libdl = cc.find_library('dl') threads = dependency('threads') config_h = configure_file( output: 'config.h', configuration: conf) config_include = include_directories('.') add_project_arguments('-include', 'config.h', language: 'c') # Binary. lxcfs_sources = files('src/lxcfs.c', 'src/utils.c') lxcfs = executable( 'lxcfs', lxcfs_sources, dependencies: [ threads, libdl, libfuse, ], install: true, install_dir: bindir) # Library. 
liblxcfs_sources = files( 'src/api_extensions.h', 'src/bindings.c', 'src/bindings.h', 'src/cgroups/cgfsng.c', 'src/cgroups/cgroup.c', 'src/cgroups/cgroup.h', 'src/cgroups/cgroup_utils.c', 'src/cgroups/cgroup_utils.h', 'src/cgroup_fuse.c', 'src/cgroup_fuse.h', 'src/cpuset_parse.c', 'src/cpuset_parse.h', 'src/lxcfs.c', 'src/lxcfs_fuse.h', 'src/lxcfs_fuse_compat.h', 'src/macro.h', 'src/memory_utils.h', 'src/proc_cpuview.c', 'src/proc_cpuview.h', 'src/proc_fuse.c', 'src/proc_fuse.h', 'src/proc_loadavg.c', 'src/proc_loadavg.h', 'src/syscall_numbers.h', 'src/sysfs_fuse.c', 'src/sysfs_fuse.h', 'src/utils.c', 'src/utils.h') liblxcfs_common_dependencies = declare_dependency( sources: liblxcfs_sources, dependencies: [ threads, libfuse, ]) liblxcfs = shared_module( 'lxcfs', liblxcfs_sources, dependencies: liblxcfs_common_dependencies, install: true, install_dir: lxcfsdir) # Tests. test_programs = [] if want_tests == true liblxcfs_test = shared_module( 'lxcfstest', liblxcfs_sources, dependencies: liblxcfs_common_dependencies, install: false, install_dir: lxcfsdir, c_args: '-DRELOADTEST -DDEBUG') endif # RPM spec. lxcfs_spec = custom_target( 'lxcfs.spec', build_by_default: true, input: 'lxcfs.spec.in', output: 'lxcfs.spec', command: [ meson_render_jinja2, config_h, '@INPUT@', '@OUTPUT@', ]) # Man pages if want_docs == true help2man = find_program('help2man') help2man_opts = [ '--name="System virtualization filesystem for containers"', '--no-discard-stderr', '--section=1', '--opt-include=docs/lxcfs.man.add', '--no-info', ] custom_target('lxcfs.1', output: 'lxcfs.1', command: [help2man, help2man_opts, '--output=@OUTPUT@', lxcfs], install: true, install_dir: join_paths(lxcmandir, 'man1')) endif # Include sub-directories. subdir('config/init') subdir('share') subdir('tests') # Build overview. 
status = [ '@0@ @1@'.format(meson.project_name(), meson.project_version()), 'FUSE version: @0@'.format(libfuse.version()), 'bin directory: @0@'.format(bindir), 'lib directory: @0@'.format(libdir), 'data directory: @0@'.format(datadir), 'local state directory: @0@'.format(localstatedir), 'prefix directory: @0@'.format(prefixdir), 'runtime directory: @0@'.format(runtimepath), 'sysconf directory: @0@'.format(sysconfdir), 'lxc conf directory: @0@'.format(lxcconfdir), 'lxcfs directory: @0@'.format(lxcfsdir), 'lxcfs shared directory: @0@'.format(lxcfssharedir), 'lxcfs build root directory: @0@'.format(project_build_root), 'lxcfs source root directory: @0@'.format(project_source_root), 'init system(s): @0@'.format(init_script), 'tests: @0@'.format(want_tests), 'documentation: @0@'.format(want_docs), ] message('\n '.join(status)) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/meson_options.txt0000664000175000017500000000131014773561567016563 0ustar00stgraberstgraber# -*- mode: meson -*- option('tests', type : 'boolean', value: 'false', description : 'enable tests') option('runtime-path', type : 'string', value : '/run', description : 'the runtime directory') option('with-init-script', type : 'string', value : 'systemd', description : 'the runtime directory') option('init-script', type : 'array', choices : ['systemd', 'sysvinit', 'openrc', 'upstart'], value : ['systemd'], description : 'init script') option('docs', type : 'boolean', value: 'true', description : 'build documentation') option('fuse-version', type : 'combo', choices : ['auto', '2', '3'], value : 'auto', description : 'fuse version to use') ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/share/0000775000175000017500000000000014773561567014235 5ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 
lxcfs-6.0.4/share/00-lxcfs.conf.in0000664000175000017500000000015114773561567017042 0ustar00stgraberstgraberlxc.hook.mount = {{LXCFSSHAREDIR}}/lxc.mount.hook lxc.hook.post-stop = {{LXCFSSHAREDIR}}/lxc.reboot.hook ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/share/lxc.mount.hook.in0000775000175000017500000000714114773561567017461 0ustar00stgraberstgraber#!/bin/sh -e # Parse command flags while [ ! $# -eq 0 ] do case "$1" in --skip-cgroup-mounts ) SKIP_CGROUP_MOUNTS=1 ;; esac shift done # We're dealing with mount entries, so expand any symlink LXC_ROOTFS_MOUNT=$(readlink -f "${LXC_ROOTFS_MOUNT}") # /proc files if [ -d {{LXCFSTARGETDIR}}/proc/ ]; then for entry in {{LXCFSTARGETDIR}}/proc/*; do DEST=$(basename "$entry") [ -e "${LXC_ROOTFS_MOUNT}/proc/${DEST}" ] || continue mount -n --bind "$entry" "${LXC_ROOTFS_MOUNT}/proc/${DEST}" done fi # /sys/devices/system/cpu if [ -d {{LXCFSTARGETDIR}}/sys/devices/system/cpu ] && [ -d "${LXC_ROOTFS_MOUNT}/sys/devices/system/cpu" ]; then if [ -f {{LXCFSTARGETDIR}}/sys/devices/system/cpu/uevent ]; then mount -n --bind {{LXCFSTARGETDIR}}/sys/devices/system/cpu "${LXC_ROOTFS_MOUNT}/sys/devices/system/cpu" else for entry in {{LXCFSTARGETDIR}}/sys/devices/system/cpu/*; do DEST=$(basename "$entry") [ -e "${LXC_ROOTFS_MOUNT}/sys/devices/system/cpu/${DEST}" ] || continue mount -n --bind "$entry" "${LXC_ROOTFS_MOUNT}/sys/devices/system/cpu/${DEST}" done fi fi # Allow nesting lxcfs if [ -d "${LXC_ROOTFS_MOUNT}{{LXCFSTARGETDIR}}/" ]; then mount -n --bind {{LXCFSTARGETDIR}} "${LXC_ROOTFS_MOUNT}{{LXCFSTARGETDIR}}/" fi # no need for lxcfs cgroups if we have cgroup namespaces [ -n "$LXC_CGNS_AWARE" ] && [ -f /proc/self/ns/cgroup ] && exit 0 # Don't mess with containers that don't have /sys/fs/cgroup configured # (lxc.mount.auto = cgroup:mixed) if touch "${LXC_ROOTFS_MOUNT}/sys/fs/cgroup/lxcfs"; then rm "${LXC_ROOTFS_MOUNT}/sys/fs/cgroup/lxcfs" else exit 0 fi # Skip mounting cgroup tree 
if requested. if [ "${SKIP_CGROUP_MOUNTS}" = "1" ]; then exit 0 fi # /sys/fs/cgroup files if [ -d "${LXC_ROOTFS_MOUNT}/sys/fs/cgroup" ]; then if [ -d {{LXCFSTARGETDIR}}/cgroup ]; then # Cleanup existing mounts for entry in "${LXC_ROOTFS_MOUNT}/sys/fs/cgroup"/*; do DEST=$(basename "$entry") if [ "${DEST}" = "cgmanager" ]; then continue fi if [ ! -d "${LXC_ROOTFS_MOUNT}/sys/fs/cgroup/${DEST}" ]; then continue fi while grep -q "${LXC_ROOTFS_MOUNT}/sys/fs/cgroup/${DEST}" /proc/self/mountinfo; do grep "${LXC_ROOTFS_MOUNT}/sys/fs/cgroup/${DEST}" /proc/self/mountinfo | cut -d' ' -f5 | while read -r line; do [ -e "${line}" ] || continue umount -l "${line}" || true done done rm -Rf "${LXC_ROOTFS_MOUNT}/sys/fs/cgroup/${DEST}" done # Mount the new entries for entry in {{LXCFSTARGETDIR}}/cgroup/*; do DEST=$(basename "$entry") if [ "$DEST" = "name=systemd" ]; then DEST="systemd" fi if [ ! -d "${LXC_ROOTFS_MOUNT}/sys/fs/cgroup/${DEST}" ]; then mkdir "${LXC_ROOTFS_MOUNT}/sys/fs/cgroup/${DEST}" fi mount -n --bind "$entry" "${LXC_ROOTFS_MOUNT}/sys/fs/cgroup/${DEST}" # make sure that cpu,cpuacct shows up as cpu and cpuacct # separately, else systemd is unhappy if echo "$DEST" | grep -q ","; then arr=$(echo "$DEST" | tr "," "\n") for single in $arr do if [ ! 
-L "${LXC_ROOTFS_MOUNT}/sys/fs/cgroup/${single}" ]; then ln -s "$DEST" "${LXC_ROOTFS_MOUNT}/sys/fs/cgroup/${single}" fi done fi done fi fi exit 0 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/share/lxc.reboot.hook.in0000775000175000017500000000003214773561567017601 0ustar00stgraberstgraber#!/bin/sh -eu sleep 0.5s ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/share/meson.build0000664000175000017500000000156614773561567016407 0ustar00stgraberstgraber# SPDX-License-Identifier: LGPL-2.1-or-later lxcfs_conf_data = custom_target( '00-lxcfs.conf', input: '00-lxcfs.conf.in', output: '00-lxcfs.conf', command: [ meson_render_jinja2, config_h, '@INPUT@', '@OUTPUT@', ], install: true, install_dir: lxcconfdir) lxcfs_hook_mount_data = custom_target( 'lxc.mount.hook', input: 'lxc.mount.hook.in', output: 'lxc.mount.hook', command: [ meson_render_jinja2, config_h, '@INPUT@', '@OUTPUT@', ], install: true, install_dir: lxcfssharedir) lxcfs_hook_reboot_data = custom_target( 'lxc.reboot.hook', input: 'lxc.reboot.hook.in', output: 'lxc.reboot.hook', command: [ meson_render_jinja2, config_h, '@INPUT@', '@OUTPUT@', ], install: true, install_dir: lxcfssharedir) ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/0000775000175000017500000000000014773561567013722 5ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/api_extensions.h0000664000175000017500000000121514773561567017122 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #ifndef __LXCFS_API_EXTENSIONS_H #define __LXCFS_API_EXTENSIONS_H #include "config.h" #include #include /* * api_extensions is the list of all API extensions in the order they were * added. 
*/ static char *api_extensions[] = { "cgroups", "sys_cpu_online", "proc_cpuinfo", "proc_diskstats", "proc_loadavg", "proc_meminfo", "proc_stat", "proc_swaps", "proc_uptime", "proc_slabinfo", "shared_pidns", "cpuview_daemon", "loadavg_daemon", "pidfds", }; static size_t nr_api_extensions = sizeof(api_extensions) / sizeof(*api_extensions); #endif /* __LXCFS_API_EXTENSIONS_H */ ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/bindings.c0000664000175000017500000006210714773561567015671 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #include "config.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "bindings.h" #include "api_extensions.h" #include "cgroup_fuse.h" #include "cgroups/cgroup.h" #include "cgroups/cgroup_utils.h" #include "memory_utils.h" #include "proc_cpuview.h" #include "syscall_numbers.h" #include "utils.h" /* directory under which we mount the controllers - /run/lxcfs/controllers */ #define BASEDIR "/lxcfs/controllers" #define ROOTDIR "/lxcfs/root" static bool can_use_pidfd; static bool can_use_swap; static bool can_use_sys_cpu; static bool has_versioned_opts; static bool memory_is_cgroupv2; static __u32 host_personality; static char runtime_path[PATH_MAX] = DEFAULT_RUNTIME_PATH; static volatile sig_atomic_t reload_successful; static char *get_base_dir(void) { return must_make_path(runtime_path, BASEDIR, NULL); } static char *get_root_dir(void) { return must_make_path(runtime_path, ROOTDIR, NULL); } bool liblxcfs_functional(void) { return reload_successful != 0; } bool liblxcfs_can_use_swap(void) { return can_use_swap; } bool liblxcfs_can_use_sys_cpu(void) { return can_use_sys_cpu; } bool liblxcfs_has_versioned_opts(void) { return has_versioned_opts; } bool 
liblxcfs_memory_is_cgroupv2(void) { return memory_is_cgroupv2; } __u32 liblxcfs_personality(void) { return host_personality; } /* Define pivot_root() if missing from the C library */ #ifndef HAVE_PIVOT_ROOT static int pivot_root(const char *new_root, const char *put_old) { return syscall(__NR_pivot_root, new_root, put_old); } #else extern int pivot_root(const char *new_root, const char *put_old); #endif /* * A table caching which pid is init for a pid namespace. * When looking up which pid is init for $qpid, we first * 1. Stat /proc/$qpid/ns/pid. * 2. Check whether the ino_t is in our store. * a. if not, fork a child in qpid's ns to send us * ucred.pid = 1, and read the initpid. Cache * initpid and creation time for /proc/initpid * in a new store entry. * b. if so, verify that /proc/initpid still matches * what we have saved. If not, clear the store * entry and go back to a. If so, return the * cached initpid. */ struct pidns_init_store { ino_t ino; /* inode number for /proc/$pid/ns/pid */ pid_t initpid; /* the pid of nit in that ns */ int init_pidfd; int64_t ctime; /* the time at which /proc/$initpid was created */ struct pidns_init_store *next; int64_t lastcheck; }; /* lol - look at how they are allocated in the kernel */ #define PIDNS_HASH_SIZE 4096 #define HASH(x) ((x) % PIDNS_HASH_SIZE) static struct pidns_init_store *pidns_hash_table[PIDNS_HASH_SIZE]; static pthread_mutex_t pidns_store_mutex = PTHREAD_MUTEX_INITIALIZER; static void mutex_lock(pthread_mutex_t *l) { int ret; ret = pthread_mutex_lock(l); if (ret) log_exit("%s - returned %d\n", strerror(ret), ret); } struct cgroup_ops *cgroup_ops; static void mutex_unlock(pthread_mutex_t *l) { int ret; ret = pthread_mutex_unlock(l); if (ret) log_exit("%s - returned %d\n", strerror(ret), ret); } static inline void store_lock(void) { mutex_lock(&pidns_store_mutex); } static inline void store_unlock(void) { mutex_unlock(&pidns_store_mutex); } #define define_interruptible_lock(type, lockname, lockfn) \ int 
lockname##_interruptible(type *l) \ { \ int ret = ETIMEDOUT; \ while (!fuse_interrupted() && (ret == ETIMEDOUT)) { \ struct timespec deadline; \ clock_gettime(CLOCK_REALTIME, &deadline); \ deadline.tv_sec += 1; \ ret = lockfn(l, &deadline); \ } \ return -ret; \ } define_interruptible_lock(pthread_mutex_t, mutex_lock, pthread_mutex_timedlock) define_interruptible_lock(pthread_rwlock_t, rwlock_rdlock, pthread_rwlock_timedrdlock) define_interruptible_lock(pthread_rwlock_t, rwlock_wrlock, pthread_rwlock_timedwrlock) #undef define_interruptible_lock /* /proc/ = 6 * + * = INTTYPE_TO_STRLEN(pid_t) * + * \0 = 1 */ #define LXCFS_PROC_PID_LEN \ (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + +1) static int initpid_still_valid_pidfd(struct pidns_init_store *entry) { int ret; if (entry->init_pidfd < 0) return ret_errno(ENOSYS); ret = pidfd_send_signal(entry->init_pidfd, 0, NULL, 0); if (ret < 0) { if (errno == ENOSYS) return ret_errno(ENOSYS); return 0; } return 1; } static int initpid_still_valid_stat(struct pidns_init_store *entry) { struct stat st; char path[LXCFS_PROC_PID_LEN]; snprintf(path, sizeof(path), "/proc/%d", entry->initpid); if (stat(path, &st) || entry->ctime != st.st_ctime) return 0; return 1; } /* Must be called under store_lock */ static bool initpid_still_valid(struct pidns_init_store *entry) { int ret; ret = initpid_still_valid_pidfd(entry); if (ret < 0) ret = initpid_still_valid_stat(entry); return ret == 1; } /* Must be called under store_lock */ static void remove_initpid(struct pidns_init_store *entry) { struct pidns_init_store *it; int ino_hash; lxcfs_debug("Removing cached entry for pid %d from init pid cache", entry->initpid); ino_hash = HASH(entry->ino); if (pidns_hash_table[ino_hash] == entry) { pidns_hash_table[ino_hash] = entry->next; close_prot_errno_disarm(entry->init_pidfd); free_disarm(entry); return; } it = pidns_hash_table[ino_hash]; while (it) { if (it->next == entry) { it->next = entry->next; 
close_prot_errno_disarm(entry->init_pidfd); free_disarm(entry); return; } it = it->next; } } #define PURGE_SECS 5 /* Must be called under store_lock */ static void prune_initpid_store(void) { static int64_t last_prune = 0; int64_t now, threshold; if (!last_prune) { last_prune = time(NULL); return; } now = time(NULL); if (now < (last_prune + PURGE_SECS)) return; lxcfs_debug("Pruning init pid cache"); last_prune = now; threshold = now - 2 * PURGE_SECS; for (int i = 0; i < PIDNS_HASH_SIZE; i++) { for (struct pidns_init_store *entry = pidns_hash_table[i], *prev = NULL; entry;) { if (entry->lastcheck < threshold) { struct pidns_init_store *cur = entry; lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid); if (prev) prev->next = entry->next; else pidns_hash_table[i] = entry->next; entry = entry->next; close_prot_errno_disarm(cur->init_pidfd); free_disarm(cur); } else { prev = entry; entry = entry->next; } } } } static void clear_initpid_store(void) { store_lock(); for (int i = 0; i < PIDNS_HASH_SIZE; i++) { for (struct pidns_init_store *entry = pidns_hash_table[i]; entry;) { struct pidns_init_store *cur = entry; lxcfs_debug("Removed cache entry for pid %d to init pid cache", cur->initpid); pidns_hash_table[i] = entry->next; entry = entry->next; close_prot_errno_disarm(cur->init_pidfd); free_disarm(cur); } } store_unlock(); } /* Must be called under store_lock */ static void save_initpid(ino_t pidns_inode, pid_t pid) { __do_free struct pidns_init_store *entry = NULL; __do_close int pidfd = -EBADF; const struct lxcfs_opts *opts = fuse_get_context()->private_data; char path[LXCFS_PROC_PID_LEN]; struct stat st; int ino_hash; if (opts && opts->use_pidfd && can_use_pidfd) { pidfd = pidfd_open(pid, 0); if (pidfd < 0) return; } snprintf(path, sizeof(path), "/proc/%d", pid); if (stat(path, &st)) return; entry = zalloc(sizeof(*entry)); if (!entry) return; ino_hash = HASH(pidns_inode); *entry = (struct pidns_init_store){ .ino = pidns_inode, .initpid = pid, 
.ctime = st.st_ctime, .next = pidns_hash_table[ino_hash], .lastcheck = time(NULL), .init_pidfd = move_fd(pidfd), }; pidns_hash_table[ino_hash] = move_ptr(entry); lxcfs_debug("Added cache entry %d for pid %d to init pid cache", ino_hash, pid); } /* * Given the stat(2) info for a nsfd pid inode, lookup the init_pid_store * entry for the inode number and creation time. Verify that the init pid * is still valid. If not, remove it. Return the entry if valid, NULL * otherwise. * Must be called under store_lock */ static pid_t lookup_verify_initpid(ino_t pidns_inode) { struct pidns_init_store *entry = pidns_hash_table[HASH(pidns_inode)]; while (entry) { if (entry->ino == pidns_inode) { if (initpid_still_valid(entry)) { entry->lastcheck = time(NULL); return entry->initpid; } remove_initpid(entry); return ret_errno(ESRCH); } entry = entry->next; } return ret_errno(ESRCH); } static bool send_creds_ok(int sock_fd) { char v = '1'; /* we are the child */ struct ucred cred = { .uid = 0, .gid = 0, .pid = 1, }; return send_creds(sock_fd, &cred, v, true) == SEND_CREDS_OK; } __returns_twice pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd) { /* * These flags don't interest at all so we don't jump through any hoops * of retrieving them and passing them to the kernel. */ errno = EINVAL; if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | CLONE_SETTLS))) return -EINVAL; #if defined(__s390x__) || defined(__s390__) || defined(__CRIS__) /* On s390/s390x and cris the order of the first and second arguments * of the system call is reversed. */ return syscall(__NR_clone, NULL, flags | SIGCHLD, pidfd); #elif defined(__sparc__) && defined(__arch64__) { /* * sparc64 always returns the other process id in %o0, and a * boolean flag whether this is the child or the parent in %o1. * Inline assembly is needed to get the flag returned in %o1. 
*/ register long g1 asm("g1") = __NR_clone; register long o0 asm("o0") = flags | SIGCHLD; register long o1 asm("o1") = 0; /* is parent/child indicator */ register long o2 asm("o2") = (unsigned long)pidfd; long is_error, retval, in_child; pid_t child_pid; asm volatile( #if defined(__arch64__) "t 0x6d\n\t" /* 64-bit trap */ #else "t 0x10\n\t" /* 32-bit trap */ #endif /* * catch errors: On sparc, the carry bit (csr) in the * processor status register (psr) is used instead of a * full register. */ "addx %%g0, 0, %%g1" : "=r"(g1), "=r"(o0), "=r"(o1), "=r"(o2) /* outputs */ : "r"(g1), "r"(o0), "r"(o1), "r"(o2) /* inputs */ : "%cc"); /* clobbers */ is_error = g1; retval = o0; in_child = o1; if (is_error) { errno = retval; return -1; } if (in_child) return 0; child_pid = retval; return child_pid; } #elif defined(__ia64__) /* On ia64 the stack and stack size are passed as separate arguments. */ return syscall(__NR_clone, flags | SIGCHLD, NULL, 0, pidfd); #else return syscall(__NR_clone, flags | SIGCHLD, NULL, pidfd); #endif } #define LXCFS_PROC_PID_NS_LEN \ (STRLITERALLEN("/proc/") + INTTYPE_TO_STRLEN(uint64_t) + \ STRLITERALLEN("/ns/pid") + 1) /* * clone a task which switches to @task's namespace and writes '1'. * over a unix sock so we can read the task's reaper's pid in our * namespace * * Note: glibc's fork() does not respect pidns, which can lead to failed * assertions inside glibc (and thus failed forks) if the child's pid in * the pidns and the parent pid outside are identical. Using clone prevents * this issue. 
*/ static void write_task_init_pid_exit(int sock, pid_t target) { __do_close int fd = -EBADF; char path[LXCFS_PROC_PID_NS_LEN]; pid_t pid; snprintf(path, sizeof(path), "/proc/%d/ns/pid", (int)target); fd = open(path, O_RDONLY | O_CLOEXEC); if (fd < 0) log_exit("write_task_init_pid_exit open of ns/pid"); if (setns(fd, 0)) log_exit("Failed to setns to pid namespace of process %d", target); pid = lxcfs_raw_clone(0, NULL); if (pid < 0) _exit(EXIT_FAILURE); if (pid == 0) { if (!send_creds_ok(sock)) _exit(EXIT_FAILURE); _exit(EXIT_SUCCESS); } if (!wait_for_pid(pid)) _exit(EXIT_FAILURE); _exit(EXIT_SUCCESS); } static pid_t scm_init_pid(pid_t task) { char v = '0'; pid_t pid_ret = -1; struct ucred cred = { .pid = -1, .uid = -1, .gid = -1, }; pid_t pid; int sock[2]; if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) return -1; pid = fork(); if (pid < 0) goto out; if (pid == 0) { close(sock[1]); write_task_init_pid_exit(sock[0], task); _exit(EXIT_SUCCESS); } if (!recv_creds(sock[1], &cred, &v)) goto out; pid_ret = cred.pid; out: close(sock[0]); close(sock[1]); if (pid > 0) wait_for_pid(pid); return pid_ret; } pid_t lookup_initpid_in_store(pid_t pid) { pid_t hashed_pid = 0; char path[LXCFS_PROC_PID_NS_LEN]; struct stat st; snprintf(path, sizeof(path), "/proc/%d/ns/pid", pid); if (stat(path, &st)) return ret_errno(ESRCH); store_lock(); hashed_pid = lookup_verify_initpid(st.st_ino); if (hashed_pid < 0) { /* release the mutex as the following call is expensive */ store_unlock(); hashed_pid = scm_init_pid(pid); store_lock(); if (hashed_pid > 0) save_initpid(st.st_ino, hashed_pid); } /* * Prune at the end in case we're pruning the value * we were about to return. */ prune_initpid_store(); store_unlock(); return hashed_pid; } /* * Functions needed to setup cgroups in the __constructor__. 
*/ static bool umount_if_mounted(void) { __do_free char *base_dir = get_base_dir(); if (umount2(base_dir, MNT_DETACH) < 0 && errno != EINVAL) { lxcfs_error("Failed to unmount %s: %s.\n", base_dir, strerror(errno)); return false; } return true; } /* __typeof__ should be safe to use with all compilers. */ typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic; static bool has_fs_type(const struct statfs *fs, fs_type_magic magic_val) { return (fs->f_type == (fs_type_magic)magic_val); } /* * looking at fs/proc_namespace.c, it appears we can * actually expect the rootfs entry to very specifically contain * " - rootfs rootfs " * IIUC, so long as we've chrooted so that rootfs is not our root, * the rootfs entry should always be skipped in mountinfo contents. */ static bool is_on_ramfs(void) { __do_free char *line = NULL; __do_free void *fopen_cache = NULL; __do_fclose FILE *f = NULL; size_t len = 0; f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache); if (!f) return false; while (getline(&line, &len, f) != -1) { int i; char *p, *p2; for (p = line, i = 0; p && i < 4; i++) p = strchr(p + 1, ' '); if (!p) continue; p2 = strchr(p + 1, ' '); if (!p2) continue; *p2 = '\0'; if (strcmp(p + 1, "/") == 0) { /* This is '/'. Is it the ramfs? 
*/ p = strchr(p2 + 1, '-'); if (p && strncmp(p, "- rootfs rootfs ", 16) == 0) return true; } } return false; } static int pivot_enter(void) { __do_close int oldroot = -EBADF, newroot = -EBADF; __do_free char *root_dir = get_root_dir(); oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); if (oldroot < 0) return log_error_errno(-1, errno, "Failed to open old root for fchdir"); newroot = open(root_dir, O_DIRECTORY | O_RDONLY | O_CLOEXEC); if (newroot < 0) return log_error_errno(-1, errno, "Failed to open new root for fchdir"); /* change into new root fs */ if (fchdir(newroot) < 0) return log_error_errno(-1, errno, "Failed to change directory to new rootfs: %s", root_dir); /* pivot_root into our new root fs */ if (pivot_root(".", ".") < 0) return log_error_errno(-1, errno, "pivot_root() syscall failed: %s", strerror(errno)); /* * At this point the old-root is mounted on top of our new-root. * To unmounted it we must not be chdir'd into it, so escape back * to the old-root. */ if (fchdir(oldroot) < 0) return log_error_errno(-1, errno, "Failed to enter old root"); if (umount2(".", MNT_DETACH) < 0) return log_error_errno(-1, errno, "Failed to detach old root"); if (fchdir(newroot) < 0) return log_error_errno(-1, errno, "Failed to re-enter new root"); return 0; } static int chroot_enter(void) { __do_free char *root_dir = get_root_dir(); if (mount(root_dir, "/", NULL, MS_REC | MS_BIND, NULL)) { lxcfs_error("Failed to recursively bind-mount %s into /.", root_dir); return -1; } if (chroot(".") < 0) { lxcfs_error("Call to chroot() failed: %s.\n", strerror(errno)); return -1; } if (chdir("/") < 0) { lxcfs_error("Failed to change directory: %s.\n", strerror(errno)); return -1; } return 0; } static int permute_and_enter(void) { struct statfs sb; if (statfs("/", &sb) < 0) { lxcfs_error("%s\n", "Could not stat / mountpoint."); return -1; } /* has_fs_type() is not reliable. When the ramfs is a tmpfs it will * likely report TMPFS_MAGIC. 
Hence, when it reports no we still check * /proc/1/mountinfo. */ if (has_fs_type(&sb, RAMFS_MAGIC) || is_on_ramfs()) return chroot_enter(); if (pivot_enter() < 0) { lxcfs_error("%s\n", "Could not perform pivot root."); return -1; } return 0; } /* Prepare our new clean root. */ static int permute_prepare(void) { __do_free char *base_dir = get_base_dir(); __do_free char *root_dir = get_root_dir(); __do_free char *new_runtime = must_make_path(root_dir, runtime_path, NULL); __do_free char *new_base_dir = must_make_path(root_dir, base_dir, NULL); if (mkdir(root_dir, 0700) < 0 && errno != EEXIST) { lxcfs_error("%s\n", "Failed to create directory for new root."); return -1; } if (mount("/", root_dir, NULL, MS_BIND, 0) < 0) { lxcfs_error("Failed to bind-mount / for new root: %s.\n", strerror(errno)); return -1; } if (!mkdir_p(new_runtime, 0700)) { lxcfs_error("Failed to create dir %s\n", new_runtime); return -1; } if (mount(runtime_path, new_runtime, NULL, MS_BIND, 0) < 0) { lxcfs_error("Failed to bind-mount %s into new root: %s.\n", runtime_path, strerror(errno)); return -1; } if (mount(base_dir, new_base_dir, NULL, MS_REC | MS_MOVE, 0) < 0) { printf("Failed to move %s into new root: %s.\n", base_dir, strerror(errno)); return -1; } return 0; } /* Calls chroot() on ramfs, pivot_root() in all other cases. */ static bool permute_root(void) { /* Prepare new root. */ if (permute_prepare() < 0) return false; /* Pivot into new root. 
*/ if (permute_and_enter() < 0) return false; return true; } static bool cgfs_prepare_mounts(void) { __do_free char *base_dir = get_base_dir(); if (!mkdir_p(base_dir, 0700)) { lxcfs_error("%s\n", "Failed to create lxcfs cgroup mountpoint."); return false; } if (!umount_if_mounted()) { lxcfs_error("%s\n", "Failed to clean up old lxcfs cgroup mountpoint."); return false; } if (unshare(CLONE_NEWNS) < 0) { lxcfs_error("Failed to unshare mount namespace: %s.\n", strerror(errno)); return false; } cgroup_ops->mntns_fd = preserve_ns(getpid(), "mnt"); if (cgroup_ops->mntns_fd < 0) { lxcfs_error("Failed to preserve mount namespace: %s.\n", strerror(errno)); return false; } if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0) < 0) { lxcfs_error("Failed to remount / private: %s.\n", strerror(errno)); return false; } if (mount("tmpfs", base_dir, "tmpfs", 0, "size=100000,mode=700") < 0) { lxcfs_error("%s\n", "Failed to mount tmpfs over lxcfs cgroup mountpoint."); return false; } return true; } static bool cgfs_mount_hierarchies(void) { __do_free char *base_dir = get_base_dir(); __do_free char *base_dir_cgroup_mount = must_make_path(base_dir, DEFAULT_CGROUP_MOUNTPOINT, NULL); if (!mkdir_p(base_dir_cgroup_mount, 0700)) return false; if (!cgroup_ops->mount(cgroup_ops, base_dir)) return false; for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { __do_free char *path = must_make_path(base_dir, (*h)->mountpoint, NULL); (*h)->fd = open(path, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW); if ((*h)->fd < 0) return false; } return true; } static bool cgfs_setup_controllers(void) { if (!cgfs_prepare_mounts()) return false; if (!cgfs_mount_hierarchies()) return log_error_errno(false, errno, "Failed to set up private lxcfs cgroup mounts"); if (!permute_root()) return false; return true; } static void sigusr2_toggle_virtualization(int signo, siginfo_t *info, void *extra) { int ret; if (reload_successful) { reload_successful = 0; /* write() is async signal safe */ ret = 
write(STDERR_FILENO, "Switched into non-virtualization mode\n", STRLITERALLEN("Switched into non-virtualization mode\n")); if (ret < 0) goto please_compiler; } else { reload_successful = 1; /* write() is async signal safe */ ret = write(STDERR_FILENO, "Switched into virtualization mode\n", STRLITERALLEN("Switched into virtualization mode\n")); if (ret < 0) goto please_compiler; } please_compiler: /* * The write() syscall is a function whose return value needs to be * checked. Otherwise the compiler will warn.Another one could be to * use syscall(__NR_write, ...) directly but whatever. */ return; } bool set_runtime_path(const char* new_path) { if (new_path && strlen(new_path) < PATH_MAX) { strlcpy(runtime_path, new_path, sizeof(runtime_path)); lxcfs_info("Using runtime path %s", runtime_path); return true; } else { lxcfs_error("%s\n", "Failed to overwrite the runtime path"); return false; } } void lxcfslib_init(void) { __do_close int init_ns = -EBADF, root_fd = -EBADF, pidfd = -EBADF; __do_free char *cgroup = NULL; int i = 0; pid_t pid; struct hierarchy *hierarchy; lxcfs_info("Running %s to reload liblxcfs", __func__); cgroup_ops = cgroup_init(); if (!cgroup_ops) { lxcfs_info("Failed to initialize cgroup support"); goto broken_upgrade; } /* Preserve initial namespace. */ pid = getpid(); init_ns = preserve_ns(pid, "mnt"); if (init_ns < 0) { lxcfs_info("Failed to preserve initial mount namespace"); goto broken_upgrade; } /* This function calls unshare(CLONE_NEWNS) our initial mount namespace * to privately mount lxcfs cgroups. 
*/ if (!cgfs_setup_controllers()) { log_exit("Failed to setup private cgroup mounts for lxcfs"); goto broken_upgrade; } if (setns(init_ns, 0) < 0) { log_exit("%s - Failed to switch back to initial mount namespace", strerror(errno)); goto broken_upgrade; } if (!init_cpuview()) { log_exit("Failed to init CPU view"); goto broken_upgrade; } lxcfs_info("mount namespace: %d", cgroup_ops->mntns_fd); lxcfs_info("hierarchies:"); for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++, i++) { char **controller_list = (*h)->controllers; __do_free char *controllers = NULL; if (controller_list && *controller_list) controllers = lxc_string_join(",", (const char **)controller_list, false); lxcfs_info(" %2d: fd: %3d: %s", i, (*h)->fd, controllers ?: ""); } pidfd = pidfd_open(pid, 0); if (pidfd >= 0 && pidfd_send_signal(pidfd, 0, NULL, 0) == 0) { can_use_pidfd = true; lxcfs_info("Kernel supports pidfds"); } cgroup = get_pid_cgroup(pid, "memory"); can_use_swap = cgroup && cgroup_ops->can_use_swap(cgroup_ops, cgroup); if (can_use_swap) lxcfs_info("Kernel supports swap accounting"); else lxcfs_info("Kernel does not support swap accounting"); hierarchy = cgroup_ops->get_hierarchy(cgroup_ops, "memory"); memory_is_cgroupv2 = hierarchy && is_unified_hierarchy(hierarchy); lxcfs_info("api_extensions:"); for (size_t nr = 0; nr < nr_api_extensions; nr++) lxcfs_info("- %s", api_extensions[nr]); root_fd = open("/", O_PATH | O_CLOEXEC); if (root_fd < 0) lxcfs_info("%s - Failed to open root directory", strerror(errno)); else if (fchdir(root_fd) < 0) lxcfs_info("%s - Failed to change to root directory", strerror(errno)); if (install_signal_handler(SIGUSR2, sigusr2_toggle_virtualization)) { lxcfs_info("%s - Failed to install SIGUSR2 signal handler", strerror(errno)); goto broken_upgrade; } if (get_task_personality(getpid(), &host_personality) < 0) { lxcfs_info("Failed to retrieve host personality"); goto broken_upgrade; } reload_successful = 1; return; broken_upgrade: reload_successful = 
0; lxcfs_info("Failed to run constructor %s to reload liblxcfs", __func__); } static void __attribute__((destructor)) lxcfs_exit(void) { lxcfs_info("Running destructor %s", __func__); clear_initpid_store(); free_cpuview(); cgroup_exit(cgroup_ops); } void *lxcfs_fuse_init(struct fuse_conn_info *conn, void *data) { struct fuse_context *fc = fuse_get_context(); struct lxcfs_opts *opts = fc ? fc->private_data : NULL; #if HAVE_FUSE_RETURNS_DT_TYPE can_use_sys_cpu = true; #endif has_versioned_opts = true; // We can read runtime_path as of opts version 2. if (opts && opts->version >= 2) { set_runtime_path(opts->runtime_path); } /* initialize the library */ lxcfslib_init(); return opts; } ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/bindings.h0000664000175000017500000001132214773561567015667 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #ifndef __LXCFS_BINDINGS_H #define __LXCFS_BINDINGS_H #include "config.h" #include #include #include #include #include #include #include #include #include #include "lxcfs_fuse.h" #include "cgroup_fuse.h" #include "macro.h" #include "proc_cpuview.h" #include "proc_fuse.h" #include "proc_loadavg.h" #include "sysfs_fuse.h" /* Maximum number for 64 bit integer is a string with 21 digits: 2^64 - 1 = 21 */ #define LXCFS_NUMSTRLEN64 21 /* The definitions here are well-ordered. New values should go directly * above LXC_TYPE_MAX only. 
*/ enum lxcfs_virt_t { LXC_TYPE_CGDIR, LXC_TYPE_CGFILE, LXC_TYPE_PROC_MEMINFO, #define LXC_TYPE_PROC_MEMINFO_PATH "/proc/meminfo" LXC_TYPE_PROC_CPUINFO, #define LXC_TYPE_PROC_CPUINFO_PATH "/proc/cpuinfo" LXC_TYPE_PROC_UPTIME, #define LXC_TYPE_PROC_UPTIME_PATH "/proc/uptime" LXC_TYPE_PROC_STAT, #define LXC_TYPE_PROC_STAT_PATH "/proc/stat" LXC_TYPE_PROC_DISKSTATS, #define LXC_TYPE_PROC_DISKSTATS_PATH "/proc/diskstats" LXC_TYPE_PROC_SWAPS, #define LXC_TYPE_PROC_SWAPS_PATH "/proc/swaps" LXC_TYPE_PROC_LOADAVG, #define LXC_TYPE_PROC_LOADAVG_PATH "/proc/loadavg" LXC_TYPE_PROC_SLABINFO, #define LXC_TYPE_PROC_SLABINFO_PATH "/proc/slabinfo" LXC_TYPE_SYS, LXC_TYPE_SYS_DEVICES, LXC_TYPE_SYS_DEVICES_SYSTEM, LXC_TYPE_SYS_DEVICES_SYSTEM_CPU, LXC_TYPE_SYS_DEVICES_SYSTEM_CPU_SUBDIR, LXC_TYPE_SYS_DEVICES_SYSTEM_CPU_SUBFILE, LXC_TYPE_SYS_DEVICES_SYSTEM_CPU_ONLINE, #define LXC_TYPE_SYS_DEVICES_SYSTEM_CPU_ONLINE_PATH "/sys/devices/system/cpu/online" LXC_TYPE_MAX, }; /* Macros below used to check the class from the file types above */ #define LXCFS_TYPE_CGROUP(type) (type >= LXC_TYPE_CGDIR && type <= LXC_TYPE_CGFILE) #define LXCFS_TYPE_PROC(type) (type >= LXC_TYPE_PROC_MEMINFO && type <= LXC_TYPE_PROC_SLABINFO) #define LXCFS_TYPE_SYS(type) (type >= LXC_TYPE_SYS && type <= LXC_TYPE_SYS_DEVICES_SYSTEM_CPU_ONLINE) #define LXCFS_TYPE_OK(type) (type >= LXC_TYPE_CGDIR && type < LXC_TYPE_MAX) /* * This signal will be used to signal fuse request processing thread that * request was interrupted (FUSE_INTERRUPT came from the kernel). * * It's not imporant which signal num is used, but it should not intersect with * any signals those are already handled and used somewhere. * Since, SIGUSR1 and SIGUSR2 are already utilized by lxcfs, let it be SIGTTOU. 
* * See also: * ("interrupt support") * https://github.com/libfuse/libfuse/commit/288ed4ebcea335c77793ee3d207c7466d55c4f71 */ #define LXCFS_INTR_SIGNAL SIGTTOU extern int mutex_lock_interruptible(pthread_mutex_t *l); extern int rwlock_rdlock_interruptible(pthread_rwlock_t *l); extern int rwlock_wrlock_interruptible(pthread_rwlock_t *l); struct file_info { char *controller; char *cgroup; char *file; int type; char *buf; /* unused */ int buflen; int size; /*actual data size */ int cached; }; struct lxcfs_opts { bool swap_off; bool use_pidfd; bool use_cfs; /* * Ideally we'd version by size but because of backwards compatability * and the use of bool instead of explicited __u32 and __u64 we can't. */ __u32 version; // As of opts version 2. char runtime_path[PATH_MAX]; }; typedef enum lxcfs_opt_t { LXCFS_SWAP_ON = 0, LXCFS_PIDFD_ON = 1, LXCFS_CFS_ON = 2, LXCFS_OPTS_MAX = LXCFS_CFS_ON, } lxcfs_opt_t; extern pid_t lookup_initpid_in_store(pid_t qpid); extern void prune_init_slice(char *cg); extern bool supports_pidfd(void); extern bool liblxcfs_functional(void); extern bool liblxcfs_can_use_swap(void); extern bool liblxcfs_memory_is_cgroupv2(void); extern bool liblxcfs_can_use_sys_cpu(void); extern bool liblxcfs_has_versioned_opts(void); extern __u32 liblxcfs_personality(void); static inline bool lxcfs_has_opt(struct lxcfs_opts *opts, lxcfs_opt_t opt) { if (!opts) return false; if (opt > LXCFS_OPTS_MAX) return false; switch (opt) { case LXCFS_SWAP_ON: if (!opts->swap_off) return liblxcfs_can_use_swap(); return false; case LXCFS_PIDFD_ON: return opts->use_pidfd; case LXCFS_CFS_ON: return opts->use_cfs; } return false; } static inline int install_signal_handler(int signo, void (*handler)(int, siginfo_t *, void *)) { struct sigaction action = { .sa_flags = SA_SIGINFO, .sa_sigaction = handler, }; return sigaction(signo, &action, NULL); } extern pid_t lxcfs_raw_clone(unsigned long flags, int *pidfd); static inline pid_t lxcfs_clone(int (*fn)(void *), void *arg, int flags) { 
pid_t pid; pid = lxcfs_raw_clone(flags, NULL); if (pid < 0) return -1; if (pid == 0) _exit(fn(arg)); return pid; } __visible extern void *lxcfs_fuse_init(struct fuse_conn_info *conn, void *data); #endif /* __LXCFS_BINDINGS_H */ ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/cgroup_fuse.c0000664000175000017500000013271614773561567016421 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #include "config.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "cgroup_fuse.h" #include "bindings.h" #include "cgroups/cgroup.h" #include "cgroups/cgroup_utils.h" #include "lxcfs_fuse_compat.h" #include "memory_utils.h" #include "utils.h" struct cgfs_files { char *name; uint32_t uid, gid; uint32_t mode; }; struct pid_ns_clone_args { int *cpipe; int sock; pid_t tpid; /* pid_from_ns or pid_to_ns. */ int (*wrapped) (int, pid_t); }; static inline int get_cgroup_fd_handle_named(const char *controller) { if (controller && strcmp(controller, "systemd") == 0) return get_cgroup_fd("name=systemd"); return get_cgroup_fd(controller); } static char *get_pid_cgroup_handle_named(pid_t pid, const char *controller) { if (controller && strcmp(controller, "systemd") == 0) return get_pid_cgroup(pid, "name=systemd"); return get_pid_cgroup(pid, controller); } static bool get_cgroup_handle_named(struct cgroup_ops *ops, const char *controller, const char *cgroup, const char *file, char **value) { if (controller && strcmp(controller, "systemd") == 0) return cgroup_ops->get(ops, "name=systemd", cgroup, file, value); return cgroup_ops->get(cgroup_ops, controller, cgroup, file, value); } /* * given /cgroup/freezer/a/b, return "freezer". * the returned char* should NOT be freed. 
*/ static char *pick_controller_from_path(struct fuse_context *fc, const char *path) { const char *p1; char *contr, *slash; if (strlen(path) < 9) { errno = EACCES; return NULL; } if (*(path + 7) != '/') { errno = EINVAL; return NULL; } p1 = path + 8; contr = strdupa(p1); if (!contr) { errno = ENOMEM; return NULL; } slash = strstr(contr, "/"); if (slash) *slash = '\0'; for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { if ((*h)->__controllers && strcmp((*h)->__controllers, contr) == 0) return (*h)->__controllers; } errno = ENOENT; return NULL; } /* * Find the start of cgroup in /cgroup/controller/the/cgroup/path * Note that the returned value may include files (keynames) etc */ static const char *find_cgroup_in_path(const char *path) { const char *p1; if (strlen(path) < 9) { errno = EACCES; return NULL; } p1 = strstr(path + 8, "/"); if (!p1) { errno = EINVAL; return NULL; } errno = 0; return p1 + 1; } /* * split the last path element from the path in @cg. * @dir is newly allocated and should be freed, @last not */ static void get_cgdir_and_path(const char *cg, char **dir, char **last) { char *p; do { *dir = strdup(cg); } while (!*dir); *last = strrchr(cg, '/'); if (!*last) { *last = NULL; return; } p = strrchr(*dir, '/'); *p = '\0'; } static bool is_child_cgroup(const char *controller, const char *cgroup, const char *file) { __do_free char *path = NULL; int cfd, ret; struct stat sb; cfd = get_cgroup_fd_handle_named(controller); if (cfd < 0) return false; path = must_make_path_relative(cgroup, file, NULL); ret = fstatat(cfd, path, &sb, 0); if (ret < 0 || !S_ISDIR(sb.st_mode)) return false; return true; } /* * If pid is in /a/b/c, they may see that /a exists, but not /b or /a/c. 
*/ static bool caller_may_see_dir(pid_t pid, const char *contrl, const char *cg) { bool answer = false; char *c2, *task_cg; size_t target_len, task_len; if (strcmp(cg, "/") == 0 || strcmp(cg, "./") == 0) return true; c2 = get_pid_cgroup_handle_named(pid, contrl); if (!c2) return false; prune_init_slice(c2); task_cg = c2 + 1; target_len = strlen(cg); task_len = strlen(task_cg); if (task_len == 0) { /* Task is in the root cg, it can see everything. This case is * not handled by the strmcps below, since they test for the * last /, but that is the first / that we've chopped off * above. */ answer = true; goto out; } if (strcmp(cg, task_cg) == 0) { answer = true; goto out; } if (target_len < task_len) { /* looking up a parent dir */ if (strncmp(task_cg, cg, target_len) == 0 && task_cg[target_len] == '/') answer = true; goto out; } if (target_len > task_len) { /* looking up a child dir */ if (strncmp(task_cg, cg, task_len) == 0 && cg[task_len] == '/') answer = true; goto out; } out: free(c2); return answer; } /* * taskcg is a/b/c * querycg is /a/b/c/d/e * we return 'd' */ static char *get_next_cgroup_dir(const char *taskcg, const char *querycg) { char *start, *end; if (strlen(taskcg) <= strlen(querycg)) { lxcfs_error("%s\n", "I was fed bad input."); return NULL; } if ((strcmp(querycg, "/") == 0) || (strcmp(querycg, "./") == 0)) start = strdup(taskcg + 1); else start = strdup(taskcg + strlen(querycg) + 1); if (!start) return NULL; end = strchr(start, '/'); if (end) *end = '\0'; return start; } /* * If pid is in /a/b/c/d, they may only act on things under cg=/a/b/c/d. * If pid is in /a, they may act on /a/b, but not on /b. * if the answer is false and nextcg is not NULL, then *nextcg will point * to a string containing the next cgroup directory under cg, which must be * freed by the caller. 
*/ static bool caller_is_in_ancestor(pid_t pid, const char *contrl, const char *cg, char **nextcg) { bool answer = false; char *c2; char *linecmp; c2 = get_pid_cgroup_handle_named(pid, contrl); if (!c2) return false; prune_init_slice(c2); /* * callers pass in '/' or './' (openat()) for root cgroup, otherwise * they pass in a cgroup without leading '/' * * The original line here was: * linecmp = *cg == '/' ? c2 : c2+1; * TODO: I'm not sure why you'd want to increment when *cg != '/'? * Serge, do you know? */ if (*cg == '/' || !strncmp(cg, "./", 2)) linecmp = c2; else linecmp = c2 + 1; if (strncmp(linecmp, cg, strlen(linecmp)) != 0) { if (nextcg) { *nextcg = get_next_cgroup_dir(linecmp, cg); } goto out; } answer = true; out: free(c2); return answer; } static struct cgfs_files *cgfs_get_key(const char *controller, const char *cgroup, const char *file) { __do_free char *path = NULL; struct cgfs_files *newkey; int cfd, ret; struct stat sb; cfd = get_cgroup_fd_handle_named(controller); if (cfd < 0) return false; if (file && *file == '/') file++; if (file && strchr(file, '/')) return NULL; if (file) path = must_make_path_relative(cgroup, file, NULL); else path = must_make_path_relative(cgroup, NULL); ret = fstatat(cfd, path, &sb, 0); if (ret < 0) return NULL; newkey = must_realloc(NULL, sizeof(struct cgfs_files)); if (file) newkey->name = must_copy_string(file); else if (strrchr(cgroup, '/')) newkey->name = must_copy_string(strrchr(cgroup, '/')); else newkey->name = must_copy_string(cgroup); newkey->uid = sb.st_uid; newkey->gid = sb.st_gid; newkey->mode = sb.st_mode; return newkey; } /* * Given a open file * to /proc/pid/{u,g}id_map, and an id * valid in the caller's namespace, return the id mapped into * pid's namespace. * Returns the mapped id, or -1 on error. 
*/
static int convert_id_to_ns(FILE *idfile, unsigned int in_id)
{
	unsigned int nsuid,   // base id for a range in the idfile's namespace
		     hostuid, // base id for a range in the caller's namespace
		     count;   // number of ids in this range
	char line[400];
	int ret;

	/* The same FILE may be scanned more than once, so always start at the top. */
	fseek(idfile, 0L, SEEK_SET);
	while (fgets(line, 400, idfile)) {
		ret = sscanf(line, "%u %u %u\n", &nsuid, &hostuid, &count);
		/* Skip lines that do not hold exactly three numbers. */
		if (ret != 3)
			continue;

		if (hostuid + count < hostuid || nsuid + count < nsuid) {
			/*
			 * uids wrapped around - unexpected as this is a procfile,
			 * so just bail.
			 */
			lxcfs_error("pid wrapparound at entry %u %u %u in %s\n",
				    nsuid, hostuid, count, line);
			return -1;
		}

		if (hostuid <= in_id && hostuid+count > in_id) {
			/*
			 * now since hostuid <= in_id < hostuid+count, and
			 * hostuid+count and nsuid+count do not wrap around,
			 * we know that nsuid+(in_id-hostuid) which must be
			 * less than nsuid+(count) must not wrap around
			 */
			return (in_id - hostuid) + nsuid;
		}
	}

	// no answer found
	return -1;
}

/*
 * for is_privileged_over,
 * specify whether we require the calling uid to be root in their
 * namespace
 */
#define NS_ROOT_REQD true
#define NS_ROOT_OPT false

/* Size of the buffer that holds the "/proc/<pid>/uid_map" path. */
#define PROCLEN 100

/*
 * Decide whether @uid, as seen from the user namespace of @pid, is
 * privileged over @victim.
 *
 * With @req_ns_root == NS_ROOT_OPT an identical uid is sufficient. Otherwise
 * @uid must map to 0 in @pid's uid_map and @victim must be mapped there at
 * all. Returns false on any lookup failure.
 */
static bool is_privileged_over(pid_t pid, uid_t uid, uid_t victim, bool req_ns_root)
{
	FILE *f;
	char fpath[PROCLEN];
	int ret;
	bool answer = false;
	uid_t nsuid;

	if (victim == (uid_t)-1 || uid == (uid_t)-1)
		return false;

	/*
	 * If the request is one not requiring root in the namespace,
	 * then having the same uid suffices.  (i.e. uid 1000 has write
	 * access to files owned by uid 1000)
	 */
	if (!req_ns_root && uid == victim)
		return true;

	ret = snprintf(fpath, PROCLEN, "/proc/%d/uid_map", pid);
	if (ret < 0 || ret >= PROCLEN)
		return false;

	f = fopen(fpath, "re");
	if (!f)
		return false;

	/* if caller's not root in their namespace, reject */
	nsuid = convert_id_to_ns(f, uid);
	if (nsuid)
		goto out;

	/*
	 * If victim is not mapped into caller's ns, reject.
* XXX I'm not sure this check is needed given that fuse
	 * will be sending requests where the vfs has converted
	 */
	nsuid = convert_id_to_ns(f, victim);
	if (nsuid == (uid_t)-1)
		goto out;

	answer = true;

out:
	fclose(f);
	return answer;
}

/*
 * Check whether the "other" permission bits of @fmode grant the access mode
 * requested in @req_mode (O_RDONLY, O_WRONLY or O_RDWR).
 *
 * Callers shift the mode right by 6 or 3 beforehand so that the owner or
 * group bits land in the "other" position. Any other access mode yields
 * false.
 */
static bool perms_include(int fmode, mode_t req_mode)
{
	mode_t r;

	switch (req_mode & O_ACCMODE) {
	case O_RDONLY:
		r = S_IROTH;
		break;
	case O_WRONLY:
		r = S_IWOTH;
		break;
	case O_RDWR:
		r = S_IROTH | S_IWOTH;
		break;
	default:
		return false;
	}

	return ((fmode & r) == r);
}

/* Release a key returned by cgfs_get_key(); a NULL key is accepted. */
static void free_key(struct cgfs_files *k)
{
	if (k) {
		free_disarm(k->name);
		free_disarm(k);
	}
}

/*
 * check whether a fuse context may access a cgroup dir or file
 *
 * If file is not null, it is a cgroup file to check under cg.
 * If file is null, then we are checking perms on cg itself.
 *
 * For files we can check the mode of the list_keys result.
 * For cgroups, we must make assumptions based on the files under the
 * cgroup, because cgmanager doesn't tell us ownership/perms of cgroups
 * yet.
 *
 * The owner bits are tried first when the caller is privileged over the
 * owner, then the group bits when the gids match, and finally the "other"
 * bits.
 */
static bool fc_may_access(struct fuse_context *fc, const char *contrl, const char *cg, const char *file, mode_t mode)
{
	struct cgfs_files *k = NULL;
	bool ret = false;

	k = cgfs_get_key(contrl, cg, file);
	if (!k)
		return false;

	/* Owner class: same uid, or root in the caller's namespace. */
	if (is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) {
		if (perms_include(k->mode >> 6, mode)) {
			ret = true;
			goto out;
		}
	}

	/* Group class. */
	if (fc->gid == k->gid) {
		if (perms_include(k->mode >> 3, mode)) {
			ret = true;
			goto out;
		}
	}

	/* Fall back to the "other" class. */
	ret = perms_include(k->mode, mode);

out:
	free_key(k);
	return ret;
}

/*
 * FUSE getattr handler for paths under /cgroup.
 *
 * Fills @sb and returns 0 on success or a negative errno value.
 */
__lxcfs_fuse_ops int cg_getattr(const char *path, struct stat *sb)
{
	struct timespec now;
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL;
	char *last = NULL, *path1, *path2;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	const char *controller = NULL;
	int ret = -ENOENT;

	if (!liblxcfs_functional())
		return -EIO;

	if (!fc || !cgroup_ops)
		return -EIO;

	memset(sb, 0, sizeof(struct stat));

	if (clock_gettime(CLOCK_REALTIME, &now) < 0)
		return -EINVAL;

	/* Defaults: owned by root, all timestamps set to "now". */
	sb->st_uid =
sb->st_gid = 0;
	sb->st_atim = sb->st_mtim = sb->st_ctim = now;
	sb->st_size = 0;

	/* The /cgroup root (and everything on a pure unified layout) is a plain dir. */
	if (pure_unified_layout(cgroup_ops) || strcmp(path, "/cgroup") == 0) {
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup) {
		/* this is just /cgroup/controller, return it as a dir */
		sb->st_mode = S_IFDIR | 00755;
		sb->st_nlink = 2;
		return 0;
	}

	get_cgdir_and_path(cgroup, &cgdir, &last);

	/* path1 is the parent cgroup, path2 the final path element. */
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* Fall back to the caller's own pid when no usable init pid is found. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;

	/* check that cgcopy is either a child cgroup of cgdir, or listed in its keys.
	 * Then check that caller's cgroup is under path if last is a child
	 * cgroup, or cgdir if last is a file */

	if (is_child_cgroup(controller, path1, path2)) {
		if (!caller_may_see_dir(initpid, controller, cgroup)) {
			ret = -ENOENT;
			goto out;
		}
		if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
			/*
			 * The caller's own cgroup is not a prefix of this one
			 * (e.g. this is an ancestor of it): report a
			 * read-only directory.
			 */
			sb->st_mode = S_IFDIR | 00555;
			sb->st_nlink = 2;
			ret = 0;
			goto out;
		}
		if (!fc_may_access(fc, controller, cgroup, NULL, O_RDONLY)) {
			ret = -EACCES;
			goto out;
		}

		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
sb->st_mode = S_IFDIR | 00755; k = cgfs_get_key(controller, cgroup, NULL); if (!k) { sb->st_uid = sb->st_gid = 0; } else { sb->st_uid = k->uid; sb->st_gid = k->gid; } free_key(k); sb->st_nlink = 2; ret = 0; goto out; } if ((k = cgfs_get_key(controller, path1, path2)) != NULL) { sb->st_mode = S_IFREG | k->mode; sb->st_nlink = 1; sb->st_uid = k->uid; sb->st_gid = k->gid; sb->st_size = 4096; free_key(k); if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) { ret = -ENOENT; goto out; } ret = 0; } out: free(cgdir); return ret; } /* * Chown all the files in the cgroup directory. We do this when we create a * cgroup on behalf of a user. */ static void chown_all_cgroup_files(const char *dirname, uid_t uid, gid_t gid, int fd) { struct dirent *direntp; char path[MAXPATHLEN]; size_t len; DIR *d; int fd1, ret; len = strlen(dirname); if (len >= MAXPATHLEN) { lxcfs_error("Pathname too long: %s\n", dirname); return; } fd1 = openat(fd, dirname, O_DIRECTORY); if (fd1 < 0) return; d = fdopendir(fd1); if (!d) { lxcfs_error("Failed to open %s\n", dirname); return; } while ((direntp = readdir(d))) { if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, "..")) continue; ret = snprintf(path, MAXPATHLEN, "%s/%s", dirname, direntp->d_name); if (ret < 0 || ret >= MAXPATHLEN) { lxcfs_error("Pathname too long under %s\n", dirname); continue; } if (fchownat(fd, path, uid, gid, 0) < 0) lxcfs_error("Failed to chown file %s to %u:%u", path, uid, gid); } closedir(d); } static int cgfs_create(const char *controller, const char *cg, uid_t uid, gid_t gid) { __do_free char *path = NULL; int cfd; cfd = get_cgroup_fd_handle_named(controller); if (cfd < 0) return -EINVAL; path = must_make_path_relative(cg, NULL); if (mkdirat(cfd, path, 0755) < 0) return -errno; if (uid == 0 && gid == 0) return 0; if (fchownat(cfd, path, uid, gid, 0) < 0) return -errno; chown_all_cgroup_files(path, uid, gid, cfd); return 0; } __lxcfs_fuse_ops int cg_mkdir(const char *path, mode_t mode) { struct 
fuse_context *fc = fuse_get_context();
	char *last = NULL, *path1, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!liblxcfs_functional())
		return -EIO;

	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	/* path1 is the parent cgroup the new directory is created in. */
	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last)
		path1 = "/";
	else
		path1 = cgdir;

	/* Fall back to the caller's own pid when no usable init pid is found. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, path1, &next)) {
		/*
		 * The caller's cgroup is not a prefix of the parent. If the
		 * requested name equals the next component of the caller's
		 * own cgroup path, report that it already exists.
		 */
		if (!next)
			ret = -EINVAL;
		else if (last && strcmp(next, last) == 0)
			ret = -EEXIST;
		else
			ret = -EPERM;
		goto out;
	}

	if (!fc_may_access(fc, controller, path1, NULL, O_RDWR)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, path1, NULL)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_create(controller, cgroup, fc->uid, fc->gid);

out:
	free(cgdir);
	free(next);
	return ret;
}

/*
 * Remove directory @dirname (a path relative to @cfd), recursing into the
 * subdirectories found while listing @fd.
 *
 * Failures on individual entries are only logged via lxcfs_debug(). The
 * return value reflects whether the final unlinkat() of @dirname succeeded.
 */
static bool recursive_rmdir(const char *dirname, int fd, const int cfd)
{
	__do_close int dupfd = -EBADF;
	__do_closedir DIR *dir = NULL;
	bool ret = false;
	struct dirent *direntp;
	char pathname[MAXPATHLEN];

	/* Work on a duplicate of the caller's fd. */
	dupfd = dup(fd);
	if (dupfd < 0)
		return false;

	dir = fdopendir(dupfd);
	if (!dir) {
		lxcfs_debug("Failed to open %s: %s.\n", dirname, strerror(errno));
		return false;
	}
	/* Transfer ownership of dupfd to fdopendir(). */
	move_fd(dupfd);

	while ((direntp = readdir(dir))) {
		struct stat mystat;
		int rc;

		if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, ".."))
			continue;

		rc = snprintf(pathname, MAXPATHLEN, "%s/%s", dirname, direntp->d_name);
		if (rc < 0 || rc >= MAXPATHLEN) {
			lxcfs_error("%s\n", "Pathname too long.");
			continue;
		}

		rc = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW);
		if (rc) {
			lxcfs_debug("Failed to stat %s: %s.\n", pathname, strerror(errno));
			continue;
		}

		/* Only directories are removed explicitly. */
		if (S_ISDIR(mystat.st_mode))
			if (!recursive_rmdir(pathname, fd, cfd))
lxcfs_debug("Error removing %s.\n", pathname);
	}

	/* Finally remove the directory itself. */
	ret = true;
	if (unlinkat(cfd, dirname, AT_REMOVEDIR) < 0) {
		lxcfs_debug("Failed to delete %s: %s.\n", dirname, strerror(errno));
		ret = false;
	}

	return ret;
}

/*
 * Remove cgroup @cgroup of @controller together with its child cgroups.
 * Returns true when the cgroup directory itself was removed.
 */
static bool cgfs_remove(const char *controller, const char *cgroup)
{
	__do_close int fd = -EBADF;
	__do_free char *path = NULL;
	int cfd;

	cfd = get_cgroup_fd_handle_named(controller);
	if (cfd < 0)
		return false;

	path = must_make_path_relative(cgroup, NULL);

	fd = openat(cfd, path, O_DIRECTORY);
	if (fd < 0)
		return false;

	return recursive_rmdir(path, fd, cfd);
}

/*
 * FUSE rmdir handler: remove a cgroup on behalf of the caller.
 * Returns 0 on success or a negative errno value.
 */
__lxcfs_fuse_ops int cg_rmdir(const char *path)
{
	struct fuse_context *fc = fuse_get_context();
	char *last = NULL, *cgdir = NULL, *controller, *next = NULL;
	const char *cgroup;
	int ret;

	if (!liblxcfs_functional())
		return -EIO;

	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller) /* Someone's trying to delete "/cgroup". */
		return -EPERM;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup) /* Someone's trying to delete a controller e.g. "/blkio". */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	if (!last) {
		/* Someone's trying to delete a cgroup on the same level as the
		 * "/lxc" cgroup e.g. rmdir "/cgroup/blkio/lxc" or
		 * rmdir "/cgroup/blkio/init.slice".
*/
		ret = -EPERM;
		goto out;
	}

	/* Fall back to the caller's own pid when no usable init pid is found. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_is_in_ancestor(initpid, controller, cgroup, &next)) {
		/*
		 * The target is not below the caller's own cgroup. If it is the
		 * next component of the caller's own cgroup path, report it as
		 * busy.
		 */
		if (!last || (next && (strcmp(next, last) == 0)))
			ret = -EBUSY;
		else
			ret = -ENOENT;
		goto out;
	}

	/* Removing an entry needs write access to the parent cgroup. */
	if (!fc_may_access(fc, controller, cgdir, NULL, O_WRONLY)) {
		ret = -EACCES;
		goto out;
	}
	if (!caller_is_in_ancestor(initpid, controller, cgroup, NULL)) {
		ret = -EACCES;
		goto out;
	}

	if (!cgfs_remove(controller, cgroup)) {
		ret = -EINVAL;
		goto out;
	}

	ret = 0;

out:
	free(cgdir);
	free(next);
	return ret;
}

/*
 * chmod @file (a path relative to the controller's mount) to @mode.
 * Returns true on success, false when the controller has no fd or
 * fchmodat() fails.
 */
static bool cgfs_chmod_file(const char *controller, const char *file, mode_t mode)
{
	__do_free char *path = NULL;
	int cfd;

	cfd = get_cgroup_fd_handle_named(controller);
	if (cfd < 0)
		return false;

	path = must_make_path_relative(file, NULL);
	if (fchmodat(cfd, path, mode, 0) < 0)
		return false;

	return true;
}

/*
 * FUSE chmod handler for cgroup files and directories.
 * Returns 0 on success or a negative errno value.
 */
__lxcfs_fuse_ops int cg_chmod(const char *path, mode_t mode)
{
	struct fuse_context *fc = fuse_get_context();
	char * cgdir = NULL, *last = NULL, *path1, *path2, *controller;
	struct cgfs_files *k = NULL;
	const char *cgroup;
	int ret;

	if (!liblxcfs_functional())
		return -EIO;

	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	if (strcmp(path, "/cgroup") == 0)
		return -EPERM;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return errno == ENOENT ? -EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);

	/* path1 is the parent cgroup, path2 the final path element. */
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
k = cgfs_get_key(controller, cgroup, "tasks"); } else k = cgfs_get_key(controller, path1, path2); if (!k) { ret = -EINVAL; goto out; } /* * This being a fuse request, the uid and gid must be valid * in the caller's namespace. So we can just check to make * sure that the caller is root in their uid, and privileged * over the file's current owner. */ if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_OPT)) { ret = -EPERM; goto out; } if (!cgfs_chmod_file(controller, cgroup, mode)) { ret = -EINVAL; goto out; } ret = 0; out: free_key(k); free(cgdir); return ret; } static inline bool is_dir(int dirfd, const char *path) { struct stat st; return fstatat(dirfd, path, &st, 0) == 0 && S_ISDIR(st.st_mode); } static int chown_tasks_files(int dirfd, const char *dirname, uid_t uid, gid_t gid) { __do_free char *path; path = must_make_path_relative(dirname, "tasks", NULL); if (fchownat(dirfd, path, uid, gid, 0) != 0) return -errno; free_disarm(path); path = must_make_path_relative(dirname, "cgroup.procs", NULL); if (fchownat(dirfd, path, uid, gid, 0) != 0) return -errno; return 0; } static int cgfs_chown_file(const char *controller, const char *file, uid_t uid, gid_t gid) { __do_free char *path = NULL; int cfd; cfd = get_cgroup_fd_handle_named(controller); if (cfd < 0) return false; path = must_make_path_relative(file, NULL); if (fchownat(cfd, path, uid, gid, 0) < 0) return -errno; if (is_dir(cfd, path)) return chown_tasks_files(cfd, path, uid, gid); return 0; } __lxcfs_fuse_ops int cg_chown(const char *path, uid_t uid, gid_t gid) { struct fuse_context *fc = fuse_get_context(); char *cgdir = NULL, *last = NULL, *path1, *path2, *controller; struct cgfs_files *k = NULL; const char *cgroup; int ret; if (!liblxcfs_functional()) return -EIO; if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) return -EIO; if (strcmp(path, "/cgroup") == 0) return -EPERM; controller = pick_controller_from_path(fc, path); if (!controller) return errno == ENOENT ? 
-EPERM : -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		/* this is just /cgroup/controller */
		return -EPERM;

	get_cgdir_and_path(cgroup, &cgdir, &last);

	/* path1 is the parent cgroup, path2 the final path element. */
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	if (is_child_cgroup(controller, path1, path2)) {
		// get uid, gid, from '/tasks' file and make up a mode
		// That is a hack, until cgmanager gains a GetCgroupPerms fn.
		k = cgfs_get_key(controller, cgroup, "tasks");

	} else
		k = cgfs_get_key(controller, path1, path2);

	if (!k) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * This being a fuse request, the uid and gid must be valid
	 * in the caller's namespace.  So we can just check to make
	 * sure that the caller is root in their uid, and privileged
	 * over the file's current owner.
	 */
	if (!is_privileged_over(fc->pid, fc->uid, k->uid, NS_ROOT_REQD)) {
		ret = -EACCES;
		goto out;
	}

	ret = cgfs_chown_file(controller, cgroup, uid, gid);

out:
	free_key(k);
	free(cgdir);

	return ret;
}

/*
 * FUSE open handler for cgroup files.
 *
 * On success a struct file_info describing the file is stored in fi->fh.
 * Returns 0 on success or a negative errno value.
 */
__lxcfs_fuse_ops int cg_open(const char *path, struct fuse_file_info *fi)
{
	const char *cgroup;
	char *last = NULL, *path1, *path2, * cgdir = NULL, *controller;
	struct cgfs_files *k = NULL;
	struct file_info *file_info;
	struct fuse_context *fc = fuse_get_context();
	int ret;

	if (!liblxcfs_functional())
		return -EIO;

	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	controller = pick_controller_from_path(fc, path);
	if (!controller)
		return -errno;

	cgroup = find_cgroup_in_path(path);
	if (!cgroup)
		return -errno;

	get_cgdir_and_path(cgroup, &cgdir, &last);
	/* path1 is the containing cgroup, path2 the file name. */
	if (!last) {
		path1 = "/";
		path2 = cgdir;
	} else {
		path1 = cgdir;
		path2 = last;
	}

	/* Only used as an existence check; the key itself is not needed. */
	k = cgfs_get_key(controller, path1, path2);
	if (!k) {
		ret = -EINVAL;
		goto out;
	}
	free_key(k);

	/* Fall back to the caller's own pid when no usable init pid is found. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (!caller_may_see_dir(initpid, controller, path1)) {
		ret = -ENOENT;
		goto out;
	}
	if (!fc_may_access(fc, controller, path1, path2, fi->flags)) {
		ret = -EACCES;
		goto out;
	}

	/* we'll
free this at cg_release */
	file_info = malloc(sizeof(*file_info));
	if (!file_info) {
		ret = -ENOMEM;
		goto out;
	}
	/* file_info keeps its own copies; cgdir is freed below. */
	file_info->controller = must_copy_string(controller);
	file_info->cgroup = must_copy_string(path1);
	file_info->file = must_copy_string(path2);
	file_info->type = LXC_TYPE_CGFILE;
	file_info->buf = NULL;
	file_info->buflen = 0;

	fi->fh = PTR_TO_UINT64(file_info);
	ret = 0;

out:
	free(cgdir);
	return ret;
}

#define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP )

/*
 * pid_to_ns - reads pids from a ucred over a socket, then writes the
 * int value back over the socket.  This shifts the pid from the
 * sender's pidns into tpid's pidns.
 *
 * A message carrying the byte '1' ends the loop. Returns 0 on a clean
 * finish and 1 when writing a pid back fails.
 */
static int pid_to_ns(int sock, pid_t tpid)
{
	char v = '0';
	struct ucred cred = {
		.pid = -1,
		.uid = -1,
		.gid = -1,
	};

	while (recv_creds(sock, &cred, &v)) {
		if (v == '1')
			return 0;

		if (write_nointr(sock, &cred.pid, sizeof(pid_t)) != sizeof(pid_t))
			return 1;
	}

	return 0;
}

/*
 * pid_ns_clone_wrapper - wraps pid_to_ns or pid_from_ns for usage
 * with clone(). This simply writes '1' as ACK back to the parent
 * before calling the actual wrapped function.
 *
 * @arg points to a struct pid_ns_clone_args; the return value is that of
 * the wrapped function.
 */
static int pid_ns_clone_wrapper(void *arg)
{
	struct pid_ns_clone_args* args = (struct pid_ns_clone_args *) arg;
	char b = '1';

	/* Only the write end of the ack pipe is needed here. */
	close(args->cpipe[0]);
	if (write(args->cpipe[1], &b, sizeof(char)) < 0)
		lxcfs_error("(child): error on write: %s.\n", strerror(errno));
	close(args->cpipe[1]);
	return args->wrapped(args->sock, args->tpid);
}

/*
 * pid_to_ns_wrapper: when you setns into a pidns, you yourself remain
 * in your old pidns.  Only children which you clone will be in the target
 * pidns.  So the pid_to_ns_wrapper does the setns, then clones a child to
 * actually convert pids.
 *
 * Note: glibc's fork() does not respect pidns, which can lead to failed
 * assertions inside glibc (and thus failed forks) if the child's pid in
 * the pidns and the parent pid outside are identical. Using clone prevents
 * this issue.
*/
static void pid_to_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		_exit(1);

	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);

	/* Join tpid's pid namespace. */
	if (setns(newnsfd, 0) < 0)
		_exit(1);

	close(newnsfd);

	/* Pipe over which the cloned child acknowledges that it is running. */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_to_ns
	};

	cpid = lxcfs_clone(pid_ns_clone_wrapper, &args, 0);
	if (cpid < 0)
		_exit(1);

	/* Give the child 1 second to be done forking and write its ack. */
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);

	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);

	/* This function never returns to its caller. */
	_exit(0);
}

/*
 * append pid to *src.
 * src: a pointer to a char* in which to append the pid.
 * sz: the number of characters printed so far, minus trailing \0.
 * asz: the allocated size so far
 * pid: the pid to append
 *
 * The pid is appended in decimal, followed by a newline.
 */
static void must_strcat_pid(char **src, size_t *sz, size_t *asz, pid_t pid)
{
	must_strcat(src, sz, asz, "%d\n", (int)pid);
}

/*
 * To read cgroup files with a particular pid, we will setns into the child
 * pidns, open a pipe, fork a child - which will be the first to really be in
 * the child ns - which does the cgfs_get_value and writes the data to the pipe.
*/
static bool do_read_pids(pid_t tpid, const char *contrl, const char *cg, const char *file, char **d)
{
	int sock[2] = {-1, -1};
	char *tmpdata = NULL;
	int ret;
	pid_t qpid, cpid = -1;
	bool answer = false;
	char v = '0';
	struct ucred cred;
	size_t sz = 0, asz = 0;

	if (!get_cgroup_handle_named(cgroup_ops, contrl, cg, file, &tmpdata))
		return false;

	/*
	 * Now we read the pids from returned data one by one, pass
	 * them into a child in the target namespace, read back the
	 * translated pids, and put them into our to-return data
	 */

	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) {
		perror("socketpair");
		free(tmpdata);
		return false;
	}

	cpid = fork();
	if (cpid == -1)
		goto out;

	if (!cpid) // child - exits when done
		pid_to_ns_wrapper(sock[1], tpid);

	char *ptr = tmpdata;
	cred.uid = 0;
	cred.gid = 0;
	while (sscanf(ptr, "%d\n", &qpid) == 1) {
		cred.pid = qpid;
		ret = send_creds(sock[0], &cred, v, true);

		/* SEND_CREDS_NOTSK: skip this pid and move on to the next line. */
		if (ret == SEND_CREDS_NOTSK)
			goto next;
		if (ret == SEND_CREDS_FAIL)
			goto out;

		// read converted results
		if (!wait_for_sock(sock[0], 2)) {
			lxcfs_error("Timed out waiting for pid from child: %s.\n", strerror(errno));
			goto out;
		}
		if (read(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) {
			lxcfs_error("Error reading pid from child: %s.\n", strerror(errno));
			goto out;
		}
		must_strcat_pid(d, &sz, &asz, qpid);
next:
		/* Advance to the start of the next line, if any. */
		ptr = strchr(ptr, '\n');
		if (!ptr)
			break;
		ptr++;
	}

	/* Send the '1' marker that tells the helper to stop. */
	cred.pid = getpid();
	v = '1';
	if (send_creds(sock[0], &cred, v, true) != SEND_CREDS_OK) {
		// failed to ask child to exit
		lxcfs_error("Failed to ask child to exit: %s.\n", strerror(errno));
		goto out;
	}

	answer = true;

out:
	free(tmpdata);
	if (cpid != -1)
		wait_for_pid(cpid);
	if (sock[0] != -1) {
		close(sock[0]);
		close(sock[1]);
	}
	return answer;
}

/*
 * FUSE read handler for cgroup files.
 *
 * Only reads at offset 0 return data. Returns the number of bytes placed
 * in @buf or a negative errno value.
 */
__lxcfs_fuse_ops int cg_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	struct file_info *f = INTTYPE_TO_PTR(fi->fh);
	struct cgfs_files *k = NULL;
	char *data = NULL;
	int ret;
	size_t s;
	bool r;

	if
(!liblxcfs_functional())
		return -EIO;

	if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops))
		return -EIO;

	if (f->type != LXC_TYPE_CGFILE) {
		lxcfs_error("%s\n", "Internal error: directory cache info used in cg_read.");
		return -EIO;
	}

	/* The whole content is returned by the first read; later offsets read nothing. */
	if (offset)
		return 0;

	if (!f->controller)
		return -EINVAL;

	/* Only used as an existence check; the key itself is not needed. */
	if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) {
		return -EINVAL;
	}
	free_key(k);

	if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_RDONLY)) {
		ret = -EACCES;
		goto out;
	}

	if (strcmp(f->file, "tasks") == 0 ||
	    strcmp(f->file, "/tasks") == 0 ||
	    strcmp(f->file, "/cgroup.procs") == 0 ||
	    strcmp(f->file, "cgroup.procs") == 0)
		// special case - we have to translate the pids
		r = do_read_pids(fc->pid, f->controller, f->cgroup, f->file, &data);
	else
		r = get_cgroup_handle_named(cgroup_ops, f->controller, f->cgroup, f->file, &data);

	if (!r) {
		ret = -EINVAL;
		goto out;
	}

	if (!data) {
		ret = 0;
		goto out;
	}
	/* Copy at most @size bytes; the data is truncated if it does not fit. */
	s = strlen(data);
	if (s > size)
		s = size;
	memcpy(buf, data, s);
	/* Add a trailing newline when the data lacks one and there is room. */
	if ((s > 0) && (s < size) && (data[s - 1] != '\n'))
		buf[s++] = '\n';

	ret = s;

out:
	free(data);
	return ret;
}

/*
 * FUSE opendir handler for cgroup directories.
 *
 * On success a struct file_info describing the directory is stored in
 * fi->fh. Returns 0 on success or a negative errno value.
 */
__lxcfs_fuse_ops int cg_opendir(const char *path, struct fuse_file_info *fi)
{
	struct fuse_context *fc = fuse_get_context();
	const char *cgroup;
	struct file_info *dir_info;
	char *controller = NULL;

	if (!liblxcfs_functional())
		return -EIO;

	if (!fc || !cgroup_ops)
		return -EIO;

	if (pure_unified_layout(cgroup_ops) || strcmp(path, "/cgroup") == 0) {
		/* Top-level directory: no controller or cgroup is attached. */
		cgroup = NULL;
		controller = NULL;
	} else {
		// return list of keys for the controller, and list of child cgroups
		controller = pick_controller_from_path(fc, path);
		if (!controller)
			return -errno;

		cgroup = find_cgroup_in_path(path);
		if (!cgroup) {
			/* this is just /cgroup/controller, return its contents */
			cgroup = "/";
		}
	}

	/* Fall back to the caller's own pid when no usable init pid is found. */
	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;
	if (cgroup) {
		if (!caller_may_see_dir(initpid, controller, cgroup))
			return -ENOENT;
		if (!fc_may_access(fc, controller,
cgroup, NULL, O_RDONLY)) return -EACCES; } /* we'll free this at cg_releasedir */ dir_info = malloc(sizeof(*dir_info)); if (!dir_info) return -ENOMEM; dir_info->controller = must_copy_string(controller); dir_info->cgroup = must_copy_string(cgroup); dir_info->type = LXC_TYPE_CGDIR; dir_info->buf = NULL; dir_info->file = NULL; dir_info->buflen = 0; fi->fh = PTR_TO_UINT64(dir_info); return 0; } __lxcfs_fuse_ops int cg_release(const char *path, struct fuse_file_info *fi) { do_release_file_info(fi); return 0; } __lxcfs_fuse_ops int cg_releasedir(const char *path, struct fuse_file_info *fi) { do_release_file_info(fi); return 0; } static FILE *open_pids_file(const char *controller, const char *cgroup) { __do_close int fd = -EBADF; __do_free char *path = NULL; int cfd; FILE *f; cfd = get_cgroup_fd_handle_named(controller); if (cfd < 0) return false; path = must_make_path_relative(cgroup, "cgroup.procs", NULL); fd = openat(cfd, path, O_WRONLY | O_CLOEXEC); if (fd < 0) return NULL; f = fdopen(fd, "we"); if (!f) return NULL; /* Transfer ownership of fd to fdopen(). 
*/
	move_fd(fd);
	return f;
}

/*
 * pid_from_ns - counterpart of pid_to_ns: reads plain pid values from
 * @sock and answers each with a credentials message carrying that pid.
 *
 * A pid of -1 ends the loop. When sending a pid as credentials fails, a
 * message carrying '1' and this process's own pid is sent instead.
 * Returns 0 on a clean finish, 1 on timeout or I/O error.
 */
static int pid_from_ns(int sock, pid_t tpid)
{
	pid_t vpid;
	struct ucred cred;
	char v;
	int ret;

	cred.uid = 0;
	cred.gid = 0;
	while (1) {
		if (!wait_for_sock(sock, 2)) {
			lxcfs_error("%s\n", "Timeout reading from parent.");
			return 1;
		}
		if ((ret = read(sock, &vpid, sizeof(pid_t))) != sizeof(pid_t)) {
			lxcfs_error("Bad read from parent: %s.\n", strerror(errno));
			return 1;
		}
		if (vpid == -1) // done
			break;
		v = '0';
		cred.pid = vpid;
		if (send_creds(sock, &cred, v, true) != SEND_CREDS_OK) {
			/* Report the failure with a '1' message using our own pid. */
			v = '1';
			cred.pid = getpid();
			if (send_creds(sock, &cred, v, false) != SEND_CREDS_OK)
				return 1;
		}
	}
	return 0;
}

/*
 * pid_from_ns_wrapper: setns into @tpid's pid namespace, clone a child
 * that runs pid_from_ns() on @sock, wait for its ack and its exit, then
 * _exit(). This function never returns to its caller.
 */
static void pid_from_ns_wrapper(int sock, pid_t tpid)
{
	int newnsfd = -1, ret, cpipe[2];
	char fnam[100];
	pid_t cpid;
	char v;

	ret = snprintf(fnam, sizeof(fnam), "/proc/%d/ns/pid", tpid);
	if (ret < 0 || (size_t)ret >= sizeof(fnam))
		_exit(1);

	newnsfd = open(fnam, O_RDONLY);
	if (newnsfd < 0)
		_exit(1);

	/* Join tpid's pid namespace. */
	if (setns(newnsfd, 0) < 0)
		_exit(1);

	close(newnsfd);

	/* Pipe over which the cloned child acknowledges that it is running. */
	if (pipe(cpipe) < 0)
		_exit(1);

	struct pid_ns_clone_args args = {
		.cpipe = cpipe,
		.sock = sock,
		.tpid = tpid,
		.wrapped = &pid_from_ns
	};

	cpid = lxcfs_clone(pid_ns_clone_wrapper, &args, 0);
	if (cpid < 0)
		_exit(1);

	// give the child 1 second to be done forking and
	// write its ack
	if (!wait_for_sock(cpipe[0], 1))
		_exit(1);

	ret = read(cpipe[0], &v, 1);
	if (ret != sizeof(char) || v != '1')
		_exit(1);

	if (!wait_for_pid(cpid))
		_exit(1);

	_exit(0);
}

/*
 * get_pid_creds: get the real uid and gid of @pid from
 * /proc/$$/status
 * (XXX should we use euid here?)
*/
static void get_pid_creds(pid_t pid, uid_t *uid, gid_t *gid)
{
	char line[400];
	uid_t parsed_uid;
	gid_t parsed_gid;
	FILE *status;

	/* Both outputs stay at -1 unless the matching line is found and parsed. */
	*uid = -1;
	*gid = -1;

	sprintf(line, "/proc/%d/status", pid);
	status = fopen(line, "re");
	if (status == NULL) {
		lxcfs_error("Error opening %s: %s\n", line, strerror(errno));
		return;
	}

	while (fgets(line, 400, status)) {
		if (strncmp(line, "Uid:", 4) == 0) {
			/* First number on the line is taken as the uid. */
			if (sscanf(line + 4, "%u", &parsed_uid) != 1) {
				lxcfs_error("bad uid line for pid %u\n", pid);
				break;
			}
			*uid = parsed_uid;
			continue;
		}

		if (strncmp(line, "Gid:", 4) == 0) {
			/* First number on the line is taken as the gid. */
			if (sscanf(line + 4, "%u", &parsed_gid) != 1) {
				lxcfs_error("bad gid line for pid %u\n", pid);
				break;
			}
			*gid = parsed_gid;
		}
	}

	fclose(status);
}

/*
 * Translate host @uid through the uid_map of @pid.
 * On success the mapped uid is stored in @answer and true is returned;
 * false means the map could not be opened or holds no mapping for @uid.
 */
static bool hostuid_to_ns(uid_t uid, pid_t pid, uid_t *answer)
{
	char map_path[400];
	FILE *map;

	sprintf(map_path, "/proc/%d/uid_map", pid);
	map = fopen(map_path, "re");
	if (map == NULL)
		return false;

	*answer = convert_id_to_ns(map, uid);
	fclose(map);

	return *answer != (uid_t)-1;
}

/*
 * May the requestor @r move victim @v to a new cgroup?
 * This is allowed if
 *   . they are the same task
 *   . they are ownedy by the same uid
 *   . @r is root on the host, or
 *   . @v's uid is mapped into @r's where @r is root.
*/ static bool may_move_pid(pid_t r, uid_t r_uid, pid_t v) { uid_t v_uid, tmpuid; gid_t v_gid; if (r == v) return true; if (r_uid == 0) return true; get_pid_creds(v, &v_uid, &v_gid); if (r_uid == v_uid) return true; if (hostuid_to_ns(r_uid, r, &tmpuid) && tmpuid == 0 && hostuid_to_ns(v_uid, r, &tmpuid)) return true; return false; } static bool do_write_pids(pid_t tpid, uid_t tuid, const char *contrl, const char *cg, const char *file, const char *buf) { int sock[2] = {-1, -1}; pid_t qpid, cpid = -1; FILE *pids_file = NULL; bool answer = false, fail = false; pids_file = open_pids_file(contrl, cg); if (!pids_file) return false; /* * write the pids to a socket, have helper in writer's pidns * call movepid for us */ if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sock) < 0) { perror("socketpair"); goto out; } cpid = fork(); if (cpid == -1) goto out; if (!cpid) { // child fclose(pids_file); pid_from_ns_wrapper(sock[1], tpid); } const char *ptr = buf; while (sscanf(ptr, "%d", &qpid) == 1) { struct ucred cred; char v; if (write(sock[0], &qpid, sizeof(qpid)) != sizeof(qpid)) { lxcfs_error("Error writing pid to child: %s.\n", strerror(errno)); goto out; } if (recv_creds(sock[0], &cred, &v)) { if (v == '0') { if (!may_move_pid(tpid, tuid, cred.pid)) { fail = true; break; } if (fprintf(pids_file, "%d", (int) cred.pid) < 0) fail = true; } } ptr = strchr(ptr, '\n'); if (!ptr) break; ptr++; } /* All good, write the value */ qpid = -1; if (write(sock[0], &qpid ,sizeof(qpid)) != sizeof(qpid)) lxcfs_error("%s\n", "Warning: failed to ask child to exit."); if (!fail) answer = true; out: if (cpid != -1) wait_for_pid(cpid); if (sock[0] != -1) { close(sock[0]); close(sock[1]); } if (pids_file) { if (fclose(pids_file) != 0) answer = false; } return answer; } static bool cgfs_set_value(const char *controller, const char *cgroup, const char *file, const char *value) { __do_close int fd = -EBADF; __do_free char *path = NULL; int cfd; size_t len; ssize_t ret; cfd = 
get_cgroup_fd_handle_named(controller); if (cfd < 0) return false; path = must_make_path_relative(cgroup, file, NULL); fd = openat(cfd, path, O_WRONLY | O_CLOEXEC); if (fd < 0) return false; len = strlen(value); ret = write_nointr(fd, value, len); if (ret < 0) return false; return (size_t)ret == len; } __lxcfs_fuse_ops int cg_write(const char *path, const char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { struct fuse_context *fc = fuse_get_context(); char *localbuf = NULL; struct cgfs_files *k = NULL; struct file_info *f = INTTYPE_TO_PTR(fi->fh); bool r; if (!liblxcfs_functional()) return -EIO; if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) return -EIO; if (f->type != LXC_TYPE_CGFILE) { lxcfs_error("%s\n", "Internal error: directory cache info used in cg_write."); return -EIO; } if (offset) return 0; localbuf = alloca(size+1); localbuf[size] = '\0'; memcpy(localbuf, buf, size); if ((k = cgfs_get_key(f->controller, f->cgroup, f->file)) == NULL) { size = -EINVAL; goto out; } if (!fc_may_access(fc, f->controller, f->cgroup, f->file, O_WRONLY)) { size = -EACCES; goto out; } if (strcmp(f->file, "tasks") == 0 || strcmp(f->file, "/tasks") == 0 || strcmp(f->file, "/cgroup.procs") == 0 || strcmp(f->file, "cgroup.procs") == 0) // special case - we have to translate the pids r = do_write_pids(fc->pid, fc->uid, f->controller, f->cgroup, f->file, localbuf); else r = cgfs_set_value(f->controller, f->cgroup, f->file, localbuf); if (!r) size = -EINVAL; out: free_key(k); return size; } static bool cgfs_iterate_cgroup(const char *controller, const char *cgroup, bool directories, void ***list, size_t typesize, void *(*iterator)(const char *, const char *, const char *)) { __do_close int fd = -EBADF; __do_free char *path = NULL; __do_closedir DIR *dir = NULL; size_t sz = 0, asz = 0; int cfd; struct dirent *dirent; cfd = get_cgroup_fd_handle_named(controller); *list = NULL; if (cfd < 0) return false; path = must_make_path_relative(cgroup, NULL); fd = 
openat(cfd, path, O_DIRECTORY | O_CLOEXEC); if (fd < 0) return false; dir = fdopendir(fd); if (!dir) return false; /* Transfer ownership of fd to fdopendir(). */ move_fd(fd); while ((dirent = readdir(dir))) { int ret; char pathname[MAXPATHLEN]; struct stat mystat; if (strcmp(dirent->d_name, ".") == 0) continue; if (strcmp(dirent->d_name, "..") == 0) continue; ret = snprintf(pathname, sizeof(pathname), "%s/%s", path, dirent->d_name); if (ret < 0 || (size_t)ret >= sizeof(pathname)) { lxcfs_error("Pathname too long under %s\n", path); continue; } ret = fstatat(cfd, pathname, &mystat, AT_SYMLINK_NOFOLLOW); if (ret) { lxcfs_error("Failed to stat %s: %s\n", pathname, strerror(errno)); continue; } if (!directories && !S_ISREG(mystat.st_mode)) continue; if (directories && !S_ISDIR(mystat.st_mode)) continue; if (sz + 2 >= asz) { asz += BATCH_SIZE; *list = must_realloc(*list, asz * typesize); } (*list)[sz] = (*iterator)(controller, path, dirent->d_name); (*list)[sz + 1] = NULL; sz++; } return true; } static void *make_key_list_entry(const char *controller, const char *cgroup, const char *dir_entry) { struct cgfs_files *entry; entry = cgfs_get_key(controller, cgroup, dir_entry); if (!entry) lxcfs_error("Failed to retrieve files under %s:%s\n", controller, cgroup); return entry; } static bool cgfs_list_keys(const char *controller, const char *cgroup, struct cgfs_files ***keys) { return cgfs_iterate_cgroup(controller, cgroup, false, (void ***)keys, sizeof(*keys), &make_key_list_entry); } static void *make_children_list_entry(const char *controller, const char *cgroup, const char *dir_entry) { return strdup(dir_entry); } static bool cgfs_list_children(const char *controller, const char *cgroup, char ***list) { return cgfs_iterate_cgroup(controller, cgroup, true, (void ***)list, sizeof(*list), &make_children_list_entry); } static void free_keys(struct cgfs_files **keys) { if (!keys) return; for (int i = 0; keys[i]; i++) free_key(keys[i]); free_disarm(keys); } __lxcfs_fuse_ops int 
cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi) { struct file_info *d = INTTYPE_TO_PTR(fi->fh); struct cgfs_files **list = NULL; int i, ret; char *nextcg = NULL; struct fuse_context *fc = fuse_get_context(); char **clist = NULL; if (!liblxcfs_functional()) return -EIO; if (!fc || !cgroup_ops) return -EIO; if (pure_unified_layout(cgroup_ops)) { ret = 0; goto out; } if (dir_filler(filler, buf, ".", 0) != 0 || dir_filler(filler, buf, "..", 0) != 0) return -EIO; if (d->type != LXC_TYPE_CGDIR) { lxcfs_error("%s\n", "Internal error: file cache info used in readdir."); return -EIO; } if (!d->cgroup && !d->controller) { /* * ls /var/lib/lxcfs/cgroup - just show list of controllers. * This only works with the legacy hierarchy. */ for (struct hierarchy **h = cgroup_ops->hierarchies; h && *h; h++) { if (is_unified_hierarchy(*h)) continue; if ((*h)->__controllers && dir_filler(filler, buf, (*h)->__controllers, 0)) return -EIO; } return 0; } if (!cgfs_list_keys(d->controller, d->cgroup, &list)) { // not a valid cgroup ret = -EINVAL; goto out; } pid_t initpid = lookup_initpid_in_store(fc->pid); if (initpid <= 1 || is_shared_pidns(initpid)) initpid = fc->pid; if (!caller_is_in_ancestor(initpid, d->controller, d->cgroup, &nextcg)) { if (nextcg) { ret = dir_filler(filler, buf, nextcg, 0); free(nextcg); if (ret != 0) { ret = -EIO; goto out; } } ret = 0; goto out; } for (i = 0; list && list[i]; i++) { if (dir_filler(filler, buf, list[i]->name, 0) != 0) { ret = -EIO; goto out; } } // now get the list of child cgroups if (!cgfs_list_children(d->controller, d->cgroup, &clist)) { ret = 0; goto out; } if (clist) { for (i = 0; clist[i]; i++) { if (dir_filler(filler, buf, clist[i], 0) != 0) { ret = -EIO; goto out; } } } ret = 0; out: free_keys(list); if (clist) { for (i = 0; clist[i]; i++) free(clist[i]); free(clist); } return ret; } __lxcfs_fuse_ops int cg_access(const char *path, int mode) { int ret; const char *cgroup; char *path1, 
*path2, *controller; char *last = NULL, *cgdir = NULL; struct cgfs_files *k = NULL; struct fuse_context *fc = fuse_get_context(); if (!liblxcfs_functional()) return -EIO; if (!fc || !cgroup_ops || pure_unified_layout(cgroup_ops)) return -EIO; if (strcmp(path, "/cgroup") == 0) return 0; controller = pick_controller_from_path(fc, path); if (!controller) return -errno; cgroup = find_cgroup_in_path(path); if (!cgroup) { // access("/sys/fs/cgroup/systemd", mode) - rx allowed, w not if ((mode & W_OK) == 0) return 0; return -EACCES; } get_cgdir_and_path(cgroup, &cgdir, &last); if (!last) { path1 = "/"; path2 = cgdir; } else { path1 = cgdir; path2 = last; } k = cgfs_get_key(controller, path1, path2); if (!k) { if ((mode & W_OK) == 0) ret = 0; else ret = -EACCES; goto out; } free_key(k); pid_t initpid = lookup_initpid_in_store(fc->pid); if (initpid <= 1 || is_shared_pidns(initpid)) initpid = fc->pid; if (!caller_may_see_dir(initpid, controller, path1)) { ret = -ENOENT; goto out; } if (!fc_may_access(fc, controller, path1, path2, mode)) { ret = -EACCES; goto out; } ret = 0; out: free(cgdir); return ret; } ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/cgroup_fuse.h0000664000175000017500000000251214773561567016414 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #ifndef __LXCFS_CGROUP_FUSE_H #define __LXCFS_CGROUP_FUSE_H #include "config.h" #include #include #include #include #include #include "lxcfs_fuse.h" #include "macro.h" __visible extern int cg_getattr(const char *path, struct stat *sb); __visible extern int cg_mkdir(const char *path, mode_t mode); __visible extern int cg_rmdir(const char *path); __visible extern int cg_chmod(const char *path, mode_t mode); __visible extern int cg_chown(const char *path, uid_t uid, gid_t gid); __visible extern int cg_open(const char *path, struct fuse_file_info *fi); __visible extern int cg_read(const char *path, char *buf, size_t size, off_t offset, struct 
fuse_file_info *fi); __visible extern int cg_opendir(const char *path, struct fuse_file_info *fi); __visible extern int cg_release(const char *path, struct fuse_file_info *fi); __visible extern int cg_releasedir(const char *path, struct fuse_file_info *fi); __visible extern int cg_write(const char *path, const char *buf, size_t size, off_t offset, struct fuse_file_info *fi); __visible extern int cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi); __visible extern int cg_access(const char *path, int mode); #endif /* __LXCFS_CGROUP_FUSE_H */ ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/cgroups/0000775000175000017500000000000014773561567015404 5ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/cgroups/cgfsng.c0000664000175000017500000006374514773561567017036 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ /* * cgfs-ng.c: this is a new, simplified implementation of a filesystem * cgroup backend. The original cgfs.c was designed to be as flexible * as possible. It would try to find cgroup filesystems no matter where * or how you had them mounted, and deduce the most usable mount for * each controller. * * This new implementation assumes that cgroup filesystems are mounted * under /sys/fs/cgroup/clist where clist is either the controller, or * a comma-separated list of controllers. */ #include "config.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "../macro.h" #include "../memory_utils.h" #include "../utils.h" #include "cgroup.h" #include "cgroup_utils.h" /* Given a pointer to a null-terminated array of pointers, realloc to add one * entry, and point the new entry to NULL. Do not fail. 
Return the index to the * second-to-last entry - that is, the one which is now available for use * (keeping the list null-terminated). */ static int append_null_to_list(void ***list) { int newentry = 0; if (*list) for (; (*list)[newentry]; newentry++) ; *list = must_realloc(*list, (newentry + 2) * sizeof(void **)); (*list)[newentry + 1] = NULL; return newentry; } /* Given a null-terminated array of strings, check whether @entry is one of the * strings. */ static bool string_in_list(char **list, const char *entry) { int i; if (!list) return false; for (i = 0; list[i]; i++) if (strcmp(list[i], entry) == 0) return true; return false; } /* Return a copy of @entry prepending "name=", i.e. turn "systemd" into * "name=systemd". Do not fail. */ static char *cg_legacy_must_prefix_named(char *entry) { size_t len; char *prefixed; len = strlen(entry); prefixed = must_realloc(NULL, len + 6); memcpy(prefixed, "name=", STRLITERALLEN("name=")); memcpy(prefixed + STRLITERALLEN("name="), entry, len); prefixed[len + 5] = '\0'; return prefixed; } /* Append an entry to the clist. Do not fail. @clist must be NULL the first time * we are called. * * We also handle named subsystems here. Any controller which is not a kernel * subsystem, we prefix "name=". Any which is both a kernel and named subsystem, * we refuse to use because we're not sure which we have here. * (TODO: We could work around this in some cases by just remounting to be * unambiguous, or by comparing mountpoint contents with current cgroup.) * * The last entry will always be NULL. 
*/ static void must_append_controller(char **klist, char **nlist, char ***clist, char *entry) { int newentry; char *copy; if (string_in_list(klist, entry) && string_in_list(nlist, entry)) return; newentry = append_null_to_list((void ***)clist); if (strncmp(entry, "name=", 5) == 0) copy = must_copy_string(entry); else if (string_in_list(klist, entry)) copy = must_copy_string(entry); else copy = cg_legacy_must_prefix_named(entry); (*clist)[newentry] = copy; } /* Given a handler's cgroup data, return the struct hierarchy for the controller * @c, or NULL if there is none. */ static struct hierarchy *cgfsng_get_hierarchy(struct cgroup_ops *ops, const char *controller) { int i; errno = ENOENT; if (!ops->hierarchies) return NULL; for (i = 0; ops->hierarchies[i]; i++) { if (!controller) { /* This is the empty unified hierarchy. */ if (ops->hierarchies[i]->controllers && !ops->hierarchies[i]->controllers[0]) return ops->hierarchies[i]; continue; } else if (pure_unified_layout(ops) && strcmp(controller, "devices") == 0) { if (ops->unified->bpf_device_controller) return ops->unified; break; } if (string_in_list(ops->hierarchies[i]->controllers, controller)) return ops->hierarchies[i]; } return NULL; } /* Given two null-terminated lists of strings, return true if any string is in * both. */ static bool controller_lists_intersect(char **l1, char **l2) { int i; if (!l1 || !l2) return false; for (i = 0; l1[i]; i++) { if (string_in_list(l2, l1[i])) return true; } return false; } /* For a null-terminated list of controllers @clist, return true if any of those * controllers is already listed the null-terminated list of hierarchies @hlist. * Realistically, if one is present, all must be present. 
*/ static bool controller_list_is_dup(struct hierarchy **hlist, char **clist) { int i; if (!hlist) return false; for (i = 0; hlist[i]; i++) if (controller_lists_intersect(hlist[i]->controllers, clist)) return true; return false; } /* Get the controllers from a mountinfo line There are other ways we could get * this info. For lxcfs, field 3 is /cgroup/controller-list. For cgroupfs, we * could parse the mount options. But we simply assume that the mountpoint must * be /sys/fs/cgroup/controller-list */ static char **cg_hybrid_get_controllers(char **klist, char **nlist, char *line, int type, char **controllers) { /* The fourth field is /sys/fs/cgroup/comma-delimited-controller-list * for legacy hierarchies. */ int i; char *p2, *tok; char *p = line, *sep = ","; char **aret = NULL; for (i = 0; i < 4; i++) { p = strchr(p, ' '); if (!p) return NULL; p++; } /* Note, if we change how mountinfo works, then our caller will need to * verify /sys/fs/cgroup/ in this field. */ if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0) return NULL; p += 15; p2 = strchr(p, ' '); if (!p2) return NULL; *p2 = '\0'; if (type == CGROUP_SUPER_MAGIC) { __do_free char *dup = NULL; /* strdup() here for v1 hierarchies. Otherwise * lxc_iterate_parts() will destroy mountpoints such as * "/sys/fs/cgroup/cpu,cpuacct". 
*/ dup = must_copy_string(p); if (!dup) return NULL; lxc_iterate_parts (tok, dup, sep) must_append_controller(klist, nlist, &aret, tok); *controllers = move_ptr(dup); } *p2 = ' '; return aret; } static char **cg_unified_make_empty_controller(void) { int newentry; char **aret = NULL; newentry = append_null_to_list((void ***)&aret); aret[newentry] = NULL; return aret; } static char **cg_unified_get_controllers(const char *file) { __do_free char *buf = NULL; char *sep = " \t\n"; char **aret = NULL; char *tok; buf = read_file(file); if (!buf) return NULL; lxc_iterate_parts(tok, buf, sep) { int newentry; char *copy; newentry = append_null_to_list((void ***)&aret); copy = must_copy_string(tok); aret[newentry] = copy; } return aret; } static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char *mountpoint, char *base_path, int type) { struct hierarchy *new; int newentry; new = zalloc(sizeof(*new)); new->controllers = clist; new->mountpoint = mountpoint; new->base_path = base_path; new->version = type; newentry = append_null_to_list((void ***)h); (*h)[newentry] = new; return new; } /* Get a copy of the mountpoint from @line, which is a line from * /proc/self/mountinfo. 
*/ static char *cg_hybrid_get_mountpoint(char *line) { int i; size_t len; char *p2; char *p = line, *sret = NULL; for (i = 0; i < 4; i++) { p = strchr(p, ' '); if (!p) return NULL; p++; } if (strncmp(p, DEFAULT_CGROUP_MOUNTPOINT "/", 15) != 0) return NULL; p2 = strchr(p + 15, ' '); if (!p2) return NULL; *p2 = '\0'; len = strlen(p); sret = must_realloc(NULL, len + 1); memcpy(sret, p, len); sret[len] = '\0'; return sret; } static void must_append_string(char ***list, char *entry) { int newentry; char *copy; newentry = append_null_to_list((void ***)list); copy = must_copy_string(entry); (*list)[newentry] = copy; } static int get_existing_subsystems(char ***klist, char ***nlist) { __do_free char *line = NULL; __do_fclose FILE *f = NULL; size_t len = 0; f = fopen("/proc/self/cgroup", "re"); if (!f) return -1; while (getline(&line, &len, f) != -1) { char *p, *p2, *tok; p = strchr(line, ':'); if (!p) continue; p++; p2 = strchr(p, ':'); if (!p2) continue; *p2 = '\0'; /* If the kernel has cgroup v2 support, then /proc/self/cgroup * contains an entry of the form: * * 0::/some/path * * In this case we use "cgroup2" as controller name. */ if ((p2 - p) == 0) { must_append_string(klist, "cgroup2"); continue; } lxc_iterate_parts(tok, p, ",") { if (strncmp(tok, "name=", 5) == 0) must_append_string(nlist, tok); else must_append_string(klist, tok); } } return 0; } static void trim(char *s) { size_t len; len = strlen(s); while ((len > 1) && (s[len - 1] == '\n')) s[--len] = '\0'; } /* __cg_mount_direct * * Mount cgroup hierarchies directly without using bind-mounts. The main * uses-cases are mounting cgroup hierarchies in cgroup namespaces and mounting * cgroups for the LXC_AUTO_CGROUP_FULL option. 
*/ static int __cg_mount_direct(struct hierarchy *h, const char *controllerpath) { __do_free char *controllers = NULL; char *fstype = "cgroup2"; unsigned long flags = 0; int ret; flags |= MS_NOSUID; flags |= MS_NOEXEC; flags |= MS_NODEV; flags |= MS_RELATIME; if (h->version != CGROUP2_SUPER_MAGIC) { controllers = lxc_string_join(",", (const char **)h->controllers, false); if (!controllers) return -ENOMEM; fstype = "cgroup"; } ret = mount("cgroup", controllerpath, fstype, flags, controllers); if (ret < 0) return -1; return 0; } static inline int cg_mount_cgroup_full(struct hierarchy *h, const char *controllerpath) { return __cg_mount_direct(h, controllerpath); } static bool cgfsng_mount(struct cgroup_ops *ops, const char *root) { __do_free char *cgroup_root = NULL; int ret; bool retval = false; if (!ops) return ret_set_errno(false, ENOENT); if (!ops->hierarchies) return true; cgroup_root = must_make_path(root, DEFAULT_CGROUP_MOUNTPOINT, NULL); if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) return cg_mount_cgroup_full(ops->unified, cgroup_root) == 0; /* mount tmpfs */ ret = safe_mount(NULL, cgroup_root, "tmpfs", MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME, "size=10240k,mode=755", root); if (ret < 0) goto on_error; for (int i = 0; ops->hierarchies[i]; i++) { __do_free char *controllerpath = NULL; struct hierarchy *h = ops->hierarchies[i]; char *controller = strrchr(h->mountpoint, '/'); if (!controller) continue; controller++; controllerpath = must_make_path(cgroup_root, controller, NULL); if (dir_exists(controllerpath)) continue; ret = mkdir(controllerpath, 0755); if (ret < 0) log_error_errno(goto on_error, errno, "Error creating cgroup path: %s", controllerpath); ret = cg_mount_cgroup_full(h, controllerpath); if (ret < 0) goto on_error; } retval = true; on_error: return retval; } static int cgfsng_num_hierarchies(struct cgroup_ops *ops) { int i = 0; if (!ops) return ret_set_errno(-1, ENOENT); if (!ops->hierarchies) return 0; for (; ops->hierarchies[i]; i++) ; 
return i; } static bool cgfsng_get_hierarchies(struct cgroup_ops *ops, int n, char ***out) { int i; if (!ops) return ret_set_errno(false, ENOENT); if (!ops->hierarchies) return false; /* sanity check n */ for (i = 0; i < n; i++) if (!ops->hierarchies[i]) return ret_set_errno(false, ENOENT); *out = ops->hierarchies[i]->controllers; return true; } static bool cgfsng_get(struct cgroup_ops *ops, const char *controller, const char *cgroup, const char *file, char **value) { __do_free char *path = NULL; struct hierarchy *h; h = ops->get_hierarchy(ops, controller); if (!h) return false; path = must_make_path_relative(cgroup, file, NULL); *value = readat_file(h->fd, path); return *value != NULL; } static int cgfsng_get_memory(struct cgroup_ops *ops, const char *cgroup, const char *file, char **value) { __do_free char *path = NULL; struct hierarchy *h; int cgroup2_root_fd, layout, ret; h = ops->get_hierarchy(ops, "memory"); if (!h) return -1; if (!is_unified_hierarchy(h)) { if (strcmp(file, "memory.max") == 0) file = "memory.limit_in_bytes"; else if (strcmp(file, "memory.swap.max") == 0) file = "memory.memsw.limit_in_bytes"; else if (strcmp(file, "memory.swap.current") == 0) file = "memory.memsw.usage_in_bytes"; else if (strcmp(file, "memory.current") == 0) file = "memory.usage_in_bytes"; layout = CGROUP_SUPER_MAGIC; cgroup2_root_fd = -EBADF; } else { layout = CGROUP2_SUPER_MAGIC; cgroup2_root_fd = ops->cgroup2_root_fd; } path = must_make_path_relative(cgroup, NULL); ret = cgroup_walkup_to_root(cgroup2_root_fd, h->fd, path, file, value); if (ret < 0) return ret; if (ret == 1) { *value = strdup(""); if (!*value) return -ENOMEM; } return layout; } static int cgfsng_get_memory_stats_fd(struct cgroup_ops *ops, const char *cgroup) { __do_free char *path = NULL; struct hierarchy *h; h = ops->get_hierarchy(ops, "memory"); if (!h) return -1; path = must_make_path_relative(cgroup, "memory.stat", NULL); return openat(h->fd, path, O_RDONLY | O_CLOEXEC | O_NOFOLLOW); } static int 
cgfsng_get_memory_current(struct cgroup_ops *ops, const char *cgroup, char **value) { return cgfsng_get_memory(ops, cgroup, "memory.current", value); } static int cgfsng_get_memory_swap_current(struct cgroup_ops *ops, const char *cgroup, char **value) { return cgfsng_get_memory(ops, cgroup, "memory.swap.current", value); } static int cgfsng_get_memory_max(struct cgroup_ops *ops, const char *cgroup, char **value) { return cgfsng_get_memory(ops, cgroup, "memory.max", value); } static int cgfsng_get_memory_swappiness(struct cgroup_ops *ops, const char *cgroup, char **value) { return cgfsng_get_memory(ops, cgroup, "memory.swappiness", value); } static int cgfsng_get_memory_swap_max(struct cgroup_ops *ops, const char *cgroup, char **value) { return cgfsng_get_memory(ops, cgroup, "memory.swap.max", value); } static int cgfsng_get_memory_slabinfo_fd(struct cgroup_ops *ops, const char *cgroup) { __do_free char *path = NULL; struct hierarchy *h; h = ops->get_hierarchy(ops, "memory"); if (!h) return -1; if (faccessat(h->fd, "memory.kmem.slabinfo", F_OK, 0)) return -1; path = must_make_path_relative(cgroup, "memory.kmem.slabinfo", NULL); return openat(h->fd, path, O_RDONLY | O_CLOEXEC | O_NOFOLLOW); } static bool cgfsng_can_use_swap(struct cgroup_ops *ops, const char *cgroup) { __do_free char *cgroup_rel = NULL, *junk_value = NULL; const char *file; struct hierarchy *h; bool ret; h = ops->get_hierarchy(ops, "memory"); if (!h) return false; cgroup_rel = must_make_path_relative(cgroup, NULL); file = is_unified_hierarchy(h) ? "memory.swap.current" : "memory.memsw.usage_in_bytes"; /* For v2, we need to look at the lower levels of the hierarchy because * no 'memory.swap.current' file exists at the root. We must search * upwards in the hierarchy in case memory accounting is disabled via * cgroup.subtree_control for the given cgroup itself. 
*/ if (is_cgroup2_fd(h->fd) && strcmp(cgroup, "/") == 0) { /* * It looks like LXCFS sits in the root cgroup, * which means that we have to find *some* cgroup * down the tree and check a (file) presence in there. * * Note, that this only needed for cgroup2. */ __do_close int fd = -EBADF; __do_closedir DIR *dir = NULL; struct dirent *dent; fd = dup(h->fd); if (fd < 0) return false; dir = fdopendir(fd); if (!dir) { lxcfs_error("Failed to open memory cgroup hierarchy\n"); return false; } /* Transfer ownership to fdopendir(). */ move_fd(fd); ret = false; while (((dent = readdir(dir)) != NULL)) { if (strcmp(dent->d_name, ".") == 0 || strcmp(dent->d_name, "..") == 0) continue; if (dent->d_type == DT_DIR) { __do_free char *path; path = must_make_path_relative(dent->d_name, "memory.swap.current", NULL); if (!faccessat(h->fd, path, F_OK, 0)) { /* We found it. Exit. */ ret = true; break; } } } } else { /* * We can check a (file) presence on the current * level and go up in the cgroup tree if needed. 
*/ ret = cgroup_walkup_to_root(ops->cgroup2_root_fd, h->fd, cgroup_rel, file, &junk_value) == 0; } return ret; } static int cgfsng_get_memory_stats(struct cgroup_ops *ops, const char *cgroup, char **value) { return cgfsng_get_memory(ops, cgroup, "memory.stat", value); } static char *readat_cpuset(int cgroup_fd) { __do_free char *val = NULL; val = readat_file(cgroup_fd, "cpuset.cpus"); if (val && strcmp(val, "") != 0) return move_ptr(val); free_disarm(val); val = readat_file(cgroup_fd, "cpuset.cpus.effective"); if (val && strcmp(val, "") != 0) return move_ptr(val); return NULL; } static int cgfsng_get_cpuset_cpus(struct cgroup_ops *ops, const char *cgroup, char **value) { __do_close int cgroup_fd = -EBADF; __do_free char *path = NULL; char *v; struct hierarchy *h; int ret; h = ops->get_hierarchy(ops, "cpuset"); if (!h) return -1; if (!is_unified_hierarchy(h)) ret = CGROUP_SUPER_MAGIC; else ret = CGROUP2_SUPER_MAGIC; *value = NULL; path = must_make_path_relative(cgroup, NULL); cgroup_fd = openat_safe(h->fd, path); if (cgroup_fd < 0) return -1; v = readat_cpuset(cgroup_fd); if (v) { *value = v; return ret; } /* * cpuset.cpus and cpuset.cpus.effective are empty so we need to look * the nearest ancestor with a non-empty cpuset.cpus{.effective} file. 
*/ for (;;) { int fd; fd = openat_safe(cgroup_fd, "../"); if (fd < 0 || !is_cgroup_fd(fd)) return -1; close_prot_errno_replace(cgroup_fd, fd); v = readat_cpuset(fd); if (v) { *value = v; return ret; } } return -1; } static int cgfsng_get_io(struct cgroup_ops *ops, const char *cgroup, const char *file, char **value) { __do_free char *path = NULL; struct hierarchy *h; int ret; h = ops->get_hierarchy(ops, "blkio"); if (!h) return -1; if (!is_unified_hierarchy(h)) ret = CGROUP_SUPER_MAGIC; else ret = CGROUP2_SUPER_MAGIC; path = must_make_path_relative(cgroup, file, NULL); *value = readat_file(h->fd, path); if (!*value) { if (errno == ENOENT) errno = EOPNOTSUPP; return ret_errno(errno); } return ret; } static int cgfsng_get_io_service_bytes(struct cgroup_ops *ops, const char *cgroup, char **value) { return cgfsng_get_io(ops, cgroup, "blkio.io_service_bytes_recursive", value); } static int cgfsng_get_io_service_time(struct cgroup_ops *ops, const char *cgroup, char **value) { return cgfsng_get_io(ops, cgroup, "blkio.io_service_time_recursive", value); } static int cgfsng_get_io_serviced(struct cgroup_ops *ops, const char *cgroup, char **value) { return cgfsng_get_io(ops, cgroup, "blkio.io_serviced_recursive", value); } static int cgfsng_get_io_merged(struct cgroup_ops *ops, const char *cgroup, char **value) { return cgfsng_get_io(ops, cgroup, "blkio.io_merged_recursive", value); } static int cgfsng_get_io_wait_time(struct cgroup_ops *ops, const char *cgroup, char **value) { return cgfsng_get_io(ops, cgroup, "blkio.io_wait_time_recursive", value); } static bool cgfsng_can_use_cpuview(struct cgroup_ops *ops) { struct hierarchy *cpu, *cpuacct; if (pure_unified_layout(ops)) return true; cpu = ops->get_hierarchy(ops, "cpu"); if (!cpu || is_unified_hierarchy(cpu)) return false; cpuacct = ops->get_hierarchy(ops, "cpuacct"); if (!cpuacct || is_unified_hierarchy(cpuacct)) return false; return true; } /* At startup, parse_hierarchies finds all the info we need about cgroup * 
mountpoints and current cgroups, and stores it in @d. */ static int cg_hybrid_init(struct cgroup_ops *ops) { __do_free char *basecginfo = NULL; __do_free char *line = NULL; __do_free void *fopen_cache = NULL; __do_fclose FILE *f = NULL; int ret; size_t len = 0; char **klist = NULL, **nlist = NULL; /* Root spawned containers escape the current cgroup, so use init's * cgroups as our base in that case. */ basecginfo = read_file("/proc/1/cgroup"); if (!basecginfo) return ret_set_errno(-1, ENOMEM); ret = get_existing_subsystems(&klist, &nlist); if (ret < 0) return log_error_errno(-1, errno, "Failed to retrieve available legacy cgroup controllers"); f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache); if (!f) return log_error_errno(-1, errno, "Failed to open \"/proc/self/mountinfo\""); while (getline(&line, &len, f) != -1) { int type; struct hierarchy *new; char *base_cgroup = NULL, *mountpoint = NULL; char **controller_list = NULL; __do_free char *controllers = NULL; type = get_cgroup_version(line); if (type == 0) continue; if (type == CGROUP2_SUPER_MAGIC && ops->unified) continue; if (ops->cgroup_layout == CGROUP_LAYOUT_UNKNOWN) { if (type == CGROUP2_SUPER_MAGIC) ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED; else if (type == CGROUP_SUPER_MAGIC) ops->cgroup_layout = CGROUP_LAYOUT_LEGACY; } else if (ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED) { if (type == CGROUP_SUPER_MAGIC) ops->cgroup_layout = CGROUP_LAYOUT_HYBRID; } else if (ops->cgroup_layout == CGROUP_LAYOUT_LEGACY) { if (type == CGROUP2_SUPER_MAGIC) ops->cgroup_layout = CGROUP_LAYOUT_HYBRID; } controller_list = cg_hybrid_get_controllers(klist, nlist, line, type, &controllers); if (!controller_list && type == CGROUP_SUPER_MAGIC) continue; if (type == CGROUP_SUPER_MAGIC) if (controller_list_is_dup(ops->hierarchies, controller_list)) ret_set_errno(goto next, EEXIST); mountpoint = cg_hybrid_get_mountpoint(line); if (!mountpoint) log_error_errno(goto next, EINVAL, "Failed parsing mountpoint from \"%s\"", line); 
if (type == CGROUP_SUPER_MAGIC) base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, controller_list[0], CGROUP_SUPER_MAGIC); else base_cgroup = cg_hybrid_get_current_cgroup(basecginfo, NULL, CGROUP2_SUPER_MAGIC); if (!base_cgroup) log_error_errno(goto next, EINVAL, "Failed to find current cgroup %s", mountpoint); trim(base_cgroup); prune_init_scope(base_cgroup); if (type == CGROUP2_SUPER_MAGIC) { char *cgv2_ctrl_path; cgv2_ctrl_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL); controller_list = cg_unified_get_controllers(cgv2_ctrl_path); free(cgv2_ctrl_path); if (!controller_list) controller_list = cg_unified_make_empty_controller(); } new = add_hierarchy(&ops->hierarchies, controller_list, mountpoint, base_cgroup, type); new->__controllers = move_ptr(controllers); if (type == CGROUP2_SUPER_MAGIC && !ops->unified) ops->unified = new; continue; next: free_string_list(controller_list); free(mountpoint); free(base_cgroup); } free_string_list(klist); free_string_list(nlist); return 0; } static int cg_unified_init(struct cgroup_ops *ops) { __do_free char *subtree_path = NULL; int ret; char *mountpoint; char **delegatable; struct hierarchy *new; char *base_cgroup = NULL; ret = unified_cgroup_hierarchy(); if (ret == -ENOMEDIUM) return ret_errno(ENOMEDIUM); if (ret != CGROUP2_SUPER_MAGIC) return 0; base_cgroup = cg_unified_get_current_cgroup(1); if (!base_cgroup) return ret_errno(EINVAL); prune_init_scope(base_cgroup); /* * We assume that the cgroup we're currently in has been delegated to * us and we are free to further delege all of the controllers listed * in cgroup.controllers further down the hierarchy. 
*/ mountpoint = must_copy_string(DEFAULT_CGROUP_MOUNTPOINT); subtree_path = must_make_path(mountpoint, base_cgroup, "cgroup.controllers", NULL); delegatable = cg_unified_get_controllers(subtree_path); if (!delegatable) delegatable = cg_unified_make_empty_controller(); /* TODO: If the user requested specific controllers via lxc.cgroup.use * we should verify here. The reason I'm not doing it right is that I'm * not convinced that lxc.cgroup.use will be the future since it is a * global property. I much rather have an option that lets you request * controllers per container. */ new = add_hierarchy(&ops->hierarchies, delegatable, mountpoint, base_cgroup, CGROUP2_SUPER_MAGIC); ops->cgroup_layout = CGROUP_LAYOUT_UNIFIED; ops->unified = new; ops->cgroup2_root_fd = open(DEFAULT_CGROUP_MOUNTPOINT, O_DIRECTORY | O_PATH | O_CLOEXEC); if (ops->cgroup2_root_fd < 0) return -errno; return CGROUP2_SUPER_MAGIC; } static int cg_init(struct cgroup_ops *ops) { int ret; ret = cg_unified_init(ops); if (ret < 0) return -1; if (ret == CGROUP2_SUPER_MAGIC) return 0; return cg_hybrid_init(ops); } struct cgroup_ops *cgfsng_ops_init(void) { __do_free struct cgroup_ops *cgfsng_ops = NULL; cgfsng_ops = zalloc(sizeof(struct cgroup_ops)); if (!cgfsng_ops) return ret_set_errno(NULL, ENOMEM); cgfsng_ops->cgroup_layout = CGROUP_LAYOUT_UNKNOWN; cgfsng_ops->mntns_fd = -EBADF; cgfsng_ops->cgroup2_root_fd = -EBADF; if (cg_init(cgfsng_ops)) return NULL; cgfsng_ops->num_hierarchies = cgfsng_num_hierarchies; cgfsng_ops->get = cgfsng_get; cgfsng_ops->get_hierarchies = cgfsng_get_hierarchies; cgfsng_ops->get_hierarchy = cgfsng_get_hierarchy; cgfsng_ops->driver = "cgfsng"; cgfsng_ops->version = "1.0.0"; cgfsng_ops->mount = cgfsng_mount; /* memory */ cgfsng_ops->get_memory_stats_fd = cgfsng_get_memory_stats_fd; cgfsng_ops->get_memory_stats = cgfsng_get_memory_stats; cgfsng_ops->get_memory_max = cgfsng_get_memory_max; cgfsng_ops->get_memory_swappiness = cgfsng_get_memory_swappiness; 
cgfsng_ops->get_memory_swap_max = cgfsng_get_memory_swap_max; cgfsng_ops->get_memory_current = cgfsng_get_memory_current; cgfsng_ops->get_memory_swap_current = cgfsng_get_memory_swap_current; cgfsng_ops->get_memory_slabinfo_fd = cgfsng_get_memory_slabinfo_fd; cgfsng_ops->can_use_swap = cgfsng_can_use_swap; /* cpuset */ cgfsng_ops->get_cpuset_cpus = cgfsng_get_cpuset_cpus; cgfsng_ops->can_use_cpuview = cgfsng_can_use_cpuview; /* blkio */ cgfsng_ops->get_io_service_bytes = cgfsng_get_io_service_bytes; cgfsng_ops->get_io_service_time = cgfsng_get_io_service_time; cgfsng_ops->get_io_serviced = cgfsng_get_io_serviced; cgfsng_ops->get_io_merged = cgfsng_get_io_merged; cgfsng_ops->get_io_wait_time = cgfsng_get_io_wait_time; return move_ptr(cgfsng_ops); } ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/cgroups/cgroup.c0000664000175000017500000000423114773561567017047 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #include "config.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "../macro.h" #include "../memory_utils.h" #include "../utils.h" #include "cgroup.h" #include "cgroup_utils.h" extern struct cgroup_ops *cgfsng_ops_init(void); struct cgroup_ops *cgroup_init(void) { struct cgroup_ops *ops; ops = cgfsng_ops_init(); if (!ops) return log_error_errno(NULL, errno, "Failed to initialize cgroup driver"); return ops; } void cgroup_exit(struct cgroup_ops *ops) { if (!ops) return; for (struct hierarchy **it = ops->hierarchies; it && *it; it++) { for (char **p = (*it)->controllers; p && *p; p++) free(*p); free((*it)->controllers); free((*it)->__controllers); if ((*it)->fd >= 0) close((*it)->fd); free((*it)->mountpoint); free((*it)->base_path); free(*it); } if (ops->mntns_fd >= 0) close(ops->mntns_fd); if (ops->cgroup2_root_fd >= 0) close(ops->cgroup2_root_fd); 
free(ops->hierarchies); free(ops); return; } #define INIT_SCOPE "/init.scope" void prune_init_scope(char *cg) { char *point; if (!cg) return; point = cg + strlen(cg) - strlen(INIT_SCOPE); if (point < cg) return; if (strcmp(point, INIT_SCOPE) == 0) { if (point == cg) *(point + 1) = '\0'; else *point = '\0'; } } char *get_pid_cgroup(pid_t pid, const char *contrl) { int cfd; cfd = get_cgroup_fd(contrl); if (cfd < 0) return NULL; if (pure_unified_layout(cgroup_ops)) return cg_unified_get_current_cgroup(pid); return cg_legacy_get_current_cgroup(pid, contrl); } /* * Read the cpuset.cpus for cg * Return the answer in a newly allocated string which must be freed */ char *get_cpuset(const char *cg) { char *value = NULL; int ret; ret = cgroup_ops->get_cpuset_cpus(cgroup_ops, cg, &value); if (ret < 0) return NULL; return value; } ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/cgroups/cgroup.h0000664000175000017500000001437614773561567017067 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #ifndef __LXC_CGROUP_H #define __LXC_CGROUP_H #include "config.h" #include #include #include #include #include "../macro.h" #include "../memory_utils.h" #define DEFAULT_CGROUP_MOUNTPOINT "/sys/fs/cgroup" typedef enum { CGROUP_LAYOUT_UNKNOWN = -1, CGROUP_LAYOUT_LEGACY = 0, CGROUP_LAYOUT_HYBRID = 1, CGROUP_LAYOUT_UNIFIED = 2, } cgroup_layout_t; /* A descriptor for a mounted hierarchy * * @controllers * - legacy hierarchy * Either NULL, or a null-terminated list of all the co-mounted controllers. * - unified hierarchy * Either NULL, or a null-terminated list of all enabled controllers. * * @mountpoint * - The mountpoint we will use. * - legacy hierarchy * It will be either /sys/fs/cgroup/controller or * /sys/fs/cgroup/controllerlist. 
* - unified hierarchy * It will either be /sys/fs/cgroup or /sys/fs/cgroup/ * depending on whether this is a hybrid cgroup layout (mix of legacy and * unified hierarchies) or a pure unified cgroup layout. * * @container_base_path * - The cgroup under which the container cgroup path * is created. This will be either the caller's cgroup (if not root), or * init's cgroup (if root). * * @container_full_path * - The full path to the containers cgroup. * * @monitor_full_path * - The full path to the monitor's cgroup. * * @version * - legacy hierarchy * If the hierarchy is a legacy hierarchy this will be set to * CGROUP_SUPER_MAGIC. * - unified hierarchy * If the hierarchy is a unified hierarchy this will be set to * CGROUP2_SUPER_MAGIC. */ struct hierarchy { /* * cgroup2 only: what files need to be chowned to delegate a cgroup to * an unprivileged user. */ char **controllers; char *__controllers; char *mountpoint; char *base_path; int version; /* cgroup2 only */ unsigned int bpf_device_controller:1; int fd; }; struct cgroup_ops { /* * File descriptor of the mount namespace the cgroup hierarchies are * mounted in. */ int mntns_fd; /* * A file descriptor to the root of the cgroup tree if we're on a * cgroup2 only system. */ int cgroup2_root_fd; /* string constant */ const char *driver; /* string constant */ const char *version; /* @hierarchies * - A NULL-terminated array of struct hierarchy, one per legacy * hierarchy. No duplicates. First sufficient, writeable mounted * hierarchy wins. */ struct hierarchy **hierarchies; /* Pointer to the unified hierarchy. Do not free! */ struct hierarchy *unified; /* * @cgroup_layout * - What cgroup layout the container is running with. * - CGROUP_LAYOUT_UNKNOWN * The cgroup layout could not be determined. This should be treated * as an error condition. * - CGROUP_LAYOUT_LEGACY * The container is running with all controllers mounted into legacy * cgroup hierarchies. 
* - CGROUP_LAYOUT_HYBRID * The container is running with at least one controller mounted * into a legacy cgroup hierarchy and a mountpoint for the unified * hierarchy. The unified hierarchy can be empty (no controllers * enabled) or non-empty (controllers enabled). * - CGROUP_LAYOUT_UNIFIED * The container is running on a pure unified cgroup hierarchy. The * unified hierarchy can be empty (no controllers enabled) or * non-empty (controllers enabled). */ cgroup_layout_t cgroup_layout; int (*num_hierarchies)(struct cgroup_ops *ops); bool (*get_hierarchies)(struct cgroup_ops *ops, int n, char ***out); bool (*mount)(struct cgroup_ops *ops, const char *root); struct hierarchy *(*get_hierarchy)(struct cgroup_ops *ops, const char *controller); bool (*get)(struct cgroup_ops *ops, const char *controller, const char *cgroup, const char *file, char **value); /* memory */ int (*get_memory_stats_fd)(struct cgroup_ops *ops, const char *cgroup); int (*get_memory_stats)(struct cgroup_ops *ops, const char *cgroup, char **value); int (*get_memory_current)(struct cgroup_ops *ops, const char *cgroup, char **value); int (*get_memory_swap_current)(struct cgroup_ops *ops, const char *cgroup, char **value); int (*get_memory_max)(struct cgroup_ops *ops, const char *cgroup, char **value); int (*get_memory_swappiness)(struct cgroup_ops *ops, const char *cgroup, char **value); int (*get_memory_swap_max)(struct cgroup_ops *ops, const char *cgroup, char **value); int (*get_memory_slabinfo_fd)(struct cgroup_ops *ops, const char *cgroup); bool (*can_use_swap)(struct cgroup_ops *ops, const char *cgroup); /* cpuset */ int (*get_cpuset_cpus)(struct cgroup_ops *ops, const char *cgroup, char **value); bool (*can_use_cpuview)(struct cgroup_ops *ops); /* io */ int (*get_io_service_bytes)(struct cgroup_ops *ops, const char *cgroup, char **value); int (*get_io_service_time)(struct cgroup_ops *ops, const char *cgroup, char **value); int (*get_io_serviced)(struct cgroup_ops *ops, const char *cgroup, char 
**value); int (*get_io_merged)(struct cgroup_ops *ops, const char *cgroup, char **value); int (*get_io_wait_time)(struct cgroup_ops *ops, const char *cgroup, char **value); }; extern struct cgroup_ops *cgroup_ops; extern struct cgroup_ops *cgroup_init(void); extern void cgroup_exit(struct cgroup_ops *ops); extern void prune_init_scope(char *cg); static inline void __auto_cgroup_exit__(struct cgroup_ops **ops) { if (*ops) cgroup_exit(*ops); } extern int cgroup_attach(const char *name, const char *lxcpath, int64_t pid); #define __do_cgroup_exit __attribute__((__cleanup__(__auto_cgroup_exit__))) static inline bool pure_unified_layout(const struct cgroup_ops *ops) { return ops->cgroup_layout == CGROUP_LAYOUT_UNIFIED; } static inline bool is_unified_hierarchy(const struct hierarchy *h) { return h->version == CGROUP2_SUPER_MAGIC; } static inline bool is_unified_controller(int version) { return version == CGROUP2_SUPER_MAGIC; } static inline int get_cgroup_fd(const char *controller) { struct hierarchy *h; h = cgroup_ops->get_hierarchy(cgroup_ops, controller); return h ? 
h->fd : -EBADF; } extern char *get_pid_cgroup(pid_t pid, const char *contrl); extern char *get_cpuset(const char *cg); #endif ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/cgroups/cgroup_utils.c0000664000175000017500000004326214773561567020276 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #include "config.h" #include #include #include #include #include #include #include #include #include #include #include #include "../macro.h" #include "../memory_utils.h" #include "../utils.h" #include "cgroup.h" #include "cgroup_utils.h" int get_cgroup_version(char *line) { if (is_cgroupfs_v1(line)) return CGROUP_SUPER_MAGIC; if (is_cgroupfs_v2(line)) return CGROUP2_SUPER_MAGIC; return 0; } bool is_cgroupfs_v1(char *line) { char *p = strstr(line, " - "); if (!p) return false; return strncmp(p, " - cgroup ", 10) == 0; } bool is_cgroupfs_v2(char *line) { char *p = strstr(line, " - "); if (!p) return false; return strncmp(p, " - cgroup2 ", 11) == 0; } int unified_cgroup_hierarchy(void) { int ret; struct statfs fs; ret = statfs(DEFAULT_CGROUP_MOUNTPOINT, &fs); if (ret < 0) return -ENOMEDIUM; if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC)) return CGROUP2_SUPER_MAGIC; return 0; } bool is_cgroup_fd(int fd) { int ret; struct statfs fs; ret = fstatfs(fd, &fs); if (ret) return false; if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC) || is_fs_type(&fs, CGROUP_SUPER_MAGIC)) return true; return false; } bool is_cgroup2_fd(int fd) { int ret; struct statfs fs; ret = fstatfs(fd, &fs); if (ret) return false; if (is_fs_type(&fs, CGROUP2_SUPER_MAGIC)) return true; return false; } void *must_realloc(void *orig, size_t sz) { void *ret; do { ret = realloc(orig, sz); } while (!ret); return ret; } char *must_make_path(const char *first, ...) 
{ va_list args; char *cur, *dest; size_t full_len = strlen(first); size_t buf_len; size_t cur_len; dest = must_copy_string(first); cur_len = full_len; va_start(args, first); while ((cur = va_arg(args, char *)) != NULL) { buf_len = strlen(cur); full_len += buf_len; if (cur[0] != '/') full_len++; dest = must_realloc(dest, full_len + 1); if (cur[0] != '/') { memcpy(dest + cur_len, "/", 1); cur_len++; } memcpy(dest + cur_len, cur, buf_len); cur_len += buf_len; } va_end(args); dest[cur_len] = '\0'; return dest; } bool is_fs_type(const struct statfs *fs, fs_type_magic magic_val) { return (fs->f_type == (fs_type_magic)magic_val); } char *must_copy_string(const char *entry) { char *ret; if (!entry) return NULL; do { ret = strdup(entry); } while (!ret); return ret; } char *lxc_string_join(const char *sep, const char **parts, bool use_as_prefix) { char *result; char **p; size_t sep_len = strlen(sep); size_t result_len = use_as_prefix * sep_len; size_t buf_len; /* calculate new string length */ for (p = (char **)parts; *p; p++) result_len += (p > (char **)parts) * sep_len + strlen(*p); buf_len = result_len + 1; result = calloc(buf_len, 1); if (!result) return NULL; if (use_as_prefix) (void)strlcpy(result, sep, buf_len); for (p = (char **)parts; *p; p++) { if (p > (char **)parts) (void)strlcat(result, sep, buf_len); (void)strlcat(result, *p, buf_len); } return result; } int lxc_count_file_lines(const char *fn) { __do_fclose FILE *f = NULL; __do_free char *line = NULL; size_t sz = 0; int n = 0; f = fopen_cloexec(fn, "r"); if (!f) return -1; while (getline(&line, &sz, f) != -1) n++; return n; } bool dir_exists(const char *path) { struct stat sb; int ret; ret = stat(path, &sb); if (ret < 0) /* Could be something other than eexist, just say "no". */ return false; return S_ISDIR(sb.st_mode); } /* * @path: a pathname where / replaced with '\0'. * @offsetp: pointer to int showing which path segment was last seen. * Updated on return to reflect the next segment. 
* @fulllen: full original path length. * Returns a pointer to the next path segment, or NULL if done. */ static char *get_nextpath(char *path, int *offsetp, int fulllen) { int offset = *offsetp; if (offset >= fulllen) return NULL; while (offset < fulllen && path[offset] != '\0') offset++; while (offset < fulllen && path[offset] == '\0') offset++; *offsetp = offset; return (offset < fulllen) ? &path[offset] : NULL; } /* * Check that @subdir is a subdir of @dir. @len is the length of * @dir (to avoid having to recalculate it). */ static bool is_subdir(const char *subdir, const char *dir, size_t len) { size_t subdirlen = strlen(subdir); if (subdirlen < len) return false; if (strncmp(subdir, dir, len) != 0) return false; if (dir[len-1] == '/') return true; if (subdir[len] == '/' || subdirlen == len) return true; return false; } /* * Check if the open fd is a symlink. Return -ELOOP if it is. Return * -ENOENT if we couldn't fstat. Return 0 if the fd is ok. */ static int check_symlink(int fd) { struct stat sb; int ret; ret = fstat(fd, &sb); if (ret < 0) return -ENOENT; if (S_ISLNK(sb.st_mode)) return -ELOOP; return 0; } /* * Open a file or directory, provided that it contains no symlinks. * * CAVEAT: This function must not be used for other purposes than container * setup before executing the container's init */ static int open_if_safe(int dirfd, const char *nextpath) { __do_close int newfd = -EBADF; newfd = openat(dirfd, nextpath, O_RDONLY | O_CLOEXEC | O_NOFOLLOW); if (newfd >= 0) /* Was not a symlink, all good. */ return move_fd(newfd); if (errno == ELOOP) return -1; if (errno == EPERM || errno == EACCES) { /* We're not root (cause we got EPERM) so try opening with * O_PATH. */ newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW); if (newfd >= 0) { /* O_PATH will return an fd for symlinks. We know * nextpath wasn't a symlink at last openat, so if fd is * now a link, then something * fishy is going on. 
*/ int ret = check_symlink(newfd); if (ret < 0) return -1; } } return move_fd(newfd); } /* * Open a path intending for mounting, ensuring that the final path * is inside the container's rootfs. * * CAVEAT: This function must not be used for other purposes than container * setup before executing the container's init * * @target: path to be opened * @prefix_skip: a part of @target in which to ignore symbolic links. This * would be the container's rootfs. * * Return an open fd for the path, or <0 on error. */ static int open_without_symlink(const char *target, const char *prefix_skip) { __do_close int dirfd = -EBADF; __do_free char *dup = NULL; int curlen = 0, fulllen, i; fulllen = strlen(target); /* make sure prefix-skip makes sense */ if (prefix_skip && strlen(prefix_skip) > 0) { curlen = strlen(prefix_skip); if (!is_subdir(target, prefix_skip, curlen)) return -EINVAL; /* * get_nextpath() expects the curlen argument to be * on a (turned into \0) / or before it, so decrement * curlen to make sure that happens */ if (curlen) curlen--; } else { prefix_skip = "/"; curlen = 0; } /* Make a copy of target which we can hack up, and tokenize it */ dup = strdup(target); if (!dup) return ret_errno(ENOMEM); for (i = 0; i < fulllen; i++) { if (dup[i] == '/') dup[i] = '\0'; } dirfd = open(prefix_skip, O_RDONLY); if (dirfd < 0) return -1; for (;;) { int newfd; char *nextpath; nextpath = get_nextpath(dup, &curlen, fulllen); if (!nextpath) return move_fd(dirfd); newfd = open_if_safe(dirfd, nextpath); close_prot_errno_disarm(dirfd); dirfd = newfd; if (newfd < 0) return -1; } return move_fd(dirfd); } /* * Safely mount a path into a container, ensuring that the mount target * is under the container's @rootfs. 
(If @rootfs is NULL, then the container * uses the host's /) * * CAVEAT: This function must not be used for other purposes than container * setup before executing the container's init */ int safe_mount(const char *src, const char *dest, const char *fstype, unsigned long flags, const void *data, const char *rootfs) { __do_close int destfd = -EBADF, srcfd = -EBADF; int ret; /* Only needs enough for /proc/self/fd/. */ char srcbuf[50], destbuf[50]; const char *mntsrc = src; if (!rootfs) rootfs = ""; /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */ if (flags & MS_BIND && src && src[0] != '/') { srcfd = open_without_symlink(src, NULL); if (srcfd < 0) return srcfd; ret = snprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd); if (ret < 0 || ret >= (int)sizeof(srcbuf)) return -EINVAL; mntsrc = srcbuf; } destfd = open_without_symlink(dest, rootfs); if (destfd < 0) return -1; ret = snprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd); if (ret < 0 || ret >= (int)sizeof(destbuf)) return ret_errno(EINVAL); ret = mount(mntsrc, destbuf, fstype, flags, data); if (ret < 0) return -1; return 0; } FILE *fopen_cloexec(const char *path, const char *mode) { __do_close int fd = -EBADF; __do_fclose FILE *ret = NULL; int open_mode = 0; int step = 0; if (!strncmp(mode, "r+", 2)) { open_mode = O_RDWR; step = 2; } else if (!strncmp(mode, "r", 1)) { open_mode = O_RDONLY; step = 1; } else if (!strncmp(mode, "w+", 2)) { open_mode = O_RDWR | O_TRUNC | O_CREAT; step = 2; } else if (!strncmp(mode, "w", 1)) { open_mode = O_WRONLY | O_TRUNC | O_CREAT; step = 1; } else if (!strncmp(mode, "a+", 2)) { open_mode = O_RDWR | O_CREAT | O_APPEND; step = 2; } else if (!strncmp(mode, "a", 1)) { open_mode = O_WRONLY | O_CREAT | O_APPEND; step = 1; } for (; mode[step]; step++) if (mode[step] == 'x') open_mode |= O_EXCL; open_mode |= O_CLOEXEC; fd = open(path, open_mode, 0660); if (fd < 0) return NULL; ret = fdopen(fd, mode); if (!ret) return NULL; move_fd(fd); 
return move_ptr(ret); } /* Given a multi-line string, return a null-terminated copy of the current line. */ static char *copy_to_eol(char *p) { char *p2 = strchr(p, '\n'), *sret; size_t len; if (!p2) return NULL; len = p2 - p; sret = must_realloc(NULL, len + 1); memcpy(sret, p, len); sret[len] = '\0'; return sret; } static void batch_realloc(char **mem, size_t oldlen, size_t newlen) { int newbatches = (newlen / BATCH_SIZE) + 1; int oldbatches = (oldlen / BATCH_SIZE) + 1; if (!*mem || newbatches > oldbatches) { *mem = must_realloc(*mem, newbatches * BATCH_SIZE); } } void append_line(char **dest, size_t oldlen, char *new, size_t newlen) { size_t full = oldlen + newlen; batch_realloc(dest, oldlen, full + 1); memcpy(*dest + oldlen, new, newlen + 1); } static inline void drop_trailing_newlines(char *s) { int l; for (l = strlen(s); l > 0 && s[l - 1] == '\n'; l--) s[l - 1] = '\0'; } /* Slurp in a whole file */ char *read_file(const char *fnam) { __do_free char *line = NULL; __do_fclose FILE *f = NULL; int linelen; char *buf = NULL; size_t len = 0, fulllen = 0; f = fopen(fnam, "re"); if (!f) return NULL; while ((linelen = getline(&line, &len, f)) != -1) { append_line(&buf, fulllen, line, linelen); fulllen += linelen; } return buf; } char *read_file_strip_newline(const char *fnam) { char *buf; buf = read_file(fnam); if (buf) drop_trailing_newlines(buf); return buf; } /* Get current cgroup from /proc/self/cgroup for the cgroupfs v2 hierarchy. */ char *cg_unified_get_current_cgroup(pid_t pid) { __do_free char *basecginfo = NULL; char path[STRLITERALLEN("/proc//cgroup") + INTTYPE_TO_STRLEN(pid_t) + 1]; char *base_cgroup; snprintf(path, sizeof(path), "/proc/%d/cgroup", pid > 0 ? 
pid : 1); basecginfo = read_file(path); if (!basecginfo) return NULL; base_cgroup = strstr(basecginfo, "0::/"); if (!base_cgroup) return NULL; base_cgroup = base_cgroup + 3; return copy_to_eol(base_cgroup); } /* cgline: pointer to character after the first ':' in a line in a \n-terminated * /proc/self/cgroup file. Check whether controller c is present. */ static bool controller_in_clist(char *cgline, const char *c) { __do_free char *tmp = NULL; char *tok, *eol; size_t len; eol = strchr(cgline, ':'); if (!eol) return false; len = eol - cgline; tmp = must_realloc(NULL, len + 1); memcpy(tmp, cgline, len); tmp[len] = '\0'; lxc_iterate_parts(tok, tmp, ",") if (strcmp(tok, c) == 0) return true; return false; } /* @basecginfo is a copy of /proc/$$/cgroup. Return the current cgroup for * @controller. */ char *cg_hybrid_get_current_cgroup(char *basecginfo, const char *controller, int type) { char *p = basecginfo; for (;;) { bool is_cgv2_base_cgroup = false; /* cgroup v2 entry in "/proc//cgroup": "0::/some/path" */ if ((type == CGROUP2_SUPER_MAGIC) && (*p == '0')) is_cgv2_base_cgroup = true; p = strchr(p, ':'); if (!p) return NULL; p++; if (is_cgv2_base_cgroup || (controller && controller_in_clist(p, controller))) { p = strchr(p, ':'); if (!p) return NULL; p++; return copy_to_eol(p); } p = strchr(p, '\n'); if (!p) return NULL; p++; } } char *cg_legacy_get_current_cgroup(pid_t pid, const char *controller) { __do_free char *basecginfo = NULL; char path[STRLITERALLEN("/proc//cgroup") + INTTYPE_TO_STRLEN(pid_t) + 1]; snprintf(path, sizeof(path), "/proc/%d/cgroup", pid > 0 ? 
pid : 1); basecginfo = read_file(path); if (!basecginfo) return ret_set_errno(NULL, ENOMEM); return cg_hybrid_get_current_cgroup(basecginfo, controller, CGROUP_SUPER_MAGIC); } char *readat_file(int dirfd, const char *path) { __do_close int fd = -EBADF; __do_free char *line = NULL; __do_fclose FILE *f = NULL; char *buf = NULL; size_t len = 0, fulllen = 0; ssize_t linelen; fd = openat(dirfd, path, O_NOFOLLOW | O_RDONLY | O_CLOEXEC); if (fd < 0) return NULL; f = fdopen(fd, "re"); if (!f) return NULL; /* Transfer ownership of fd */ move_fd(fd); while ((linelen = getline(&line, &len, f)) != -1) { append_line(&buf, fulllen, line, linelen); fulllen += linelen; } if (buf) drop_trailing_newlines(buf); return buf; } bool mkdir_p(const char *dir, mode_t mode) { const char *tmp = dir; const char *orig = dir; char *makeme; do { dir = tmp + strspn(tmp, "/"); tmp = dir + strcspn(dir, "/"); makeme = strndup(orig, dir - orig); if (!makeme) return false; if (mkdir(makeme, mode) && errno != EEXIST) { lxcfs_error("Failed to create directory '%s': %s.\n", makeme, strerror(errno)); free(makeme); return false; } free(makeme); } while(tmp != dir); return true; } static bool same_file(int fd1, int fd2) { struct stat st1, st2; if (fstat(fd1, &st1) < 0 || fstat(fd2, &st2) < 0) return false; return (st1.st_dev == st2.st_dev) && (st1.st_ino == st2.st_ino); } /** * cgroup_walkup_to_root() - Walk upwards to cgroup root to find valid value * * @cgroup2_root_fd: File descriptor for the cgroup2 root mount point. * @hierarchy_fd: File descriptor for the hierarchy. * @cgroup: A cgroup directory relative to @hierarchy_fd. * @file: The file in @cgroup from which to read a value. * @value: Return argument to store value read from @file. * * This function tries to read a valid value from @file in @cgroup in * @hierarchy_fd. If it is a legacy cgroup hierarchy and we fail to find a * valid value we terminate early and report an error. * The cgroup2 hierarchy however, has different semantics. 
In a few controller * files it will show the value "max" or simply leave it completely empty * thereby indicating that no limit has been set for this particular cgroup. * However, that doesn't mean that there's no limit. A cgroup further up the * hierarchy could have a limit set that also applies to the cgroup we are * interested in. So for the unified cgroup hierarchy we need to keep walking * towards the cgroup2 root cgroup and try to parse a valid value. * * Returns: 0 if a limit was found, 1 if no limit was set or "max" was set, * -errno if an error occurred. */ int cgroup_walkup_to_root(int cgroup2_root_fd, int hierarchy_fd, const char *cgroup, const char *file, char **value) { __do_close int dir_fd = -EBADF; __do_free char *val = NULL; /* Look in our current cgroup for a valid value. */ dir_fd = openat(hierarchy_fd, cgroup, O_DIRECTORY | O_PATH | O_CLOEXEC); if (dir_fd < 0) return -errno; val = readat_file(dir_fd, file); if (!is_empty_string(val) && strcmp(val, "max") != 0) { *value = move_ptr(val); return 0; } if (!is_cgroup2_fd(dir_fd)) return -EINVAL; /* * Legacy cgroup hierarchies should always show a valid value in the * file of the cgroup. So no need to do this upwards walking crap. */ if (cgroup2_root_fd < 0 || !is_cgroup2_fd(cgroup2_root_fd)) return -EINVAL; else if (same_file(cgroup2_root_fd, dir_fd)) return 1; free_disarm(val); /* * Set an arbitraty hard-coded limit to prevent us from ending * up in an endless loop. There really shouldn't be any cgroup * tree that is 1000 levels deep. That would be insane in * principal and performance-wise. */ for (int i = 0; i < 1000; i++) { __do_close int inner_fd = -EBADF; __do_free char *new_val = NULL; inner_fd = move_fd(dir_fd); dir_fd = openat(inner_fd, "..", O_DIRECTORY | O_PATH | O_CLOEXEC); if (dir_fd < 0) return -errno; if (!is_cgroup2_fd(dir_fd)) return log_error_errno(-ELOOP, ELOOP, "Found non-cgroup2 directory during cgroup2 tree walkup. 
Terminating walk"); /* * We're at the root of the cgroup2 tree so stop walking * upwards. * Since we walked up the whole tree we haven't found an actual * limit anywhere apparently. * * Note that we're not checking the root cgroup itself simply * because a lot of the controllers don't expose files with * limits to the root cgroup. */ if (same_file(cgroup2_root_fd, dir_fd)) return 1; /* We found a valid value. Terminate walk. */ new_val = readat_file(dir_fd, file); if (!is_empty_string(new_val) && strcmp(new_val, "max") != 0) { *value = move_ptr(new_val); return 0; } } return log_error_errno(-ELOOP, ELOOP, "To many nested cgroups or invalid mount tree. Terminating walk"); } ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/cgroups/cgroup_utils.h0000664000175000017500000000666414773561567020310 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #ifndef __LXC_CGROUP_UTILS_H #define __LXC_CGROUP_UTILS_H #include "config.h" #include #include #include #include #include #include #include #include #include #include #include #include "../macro.h" #include "../memory_utils.h" /* Retrieve the cgroup version of a given entry from /proc//mountinfo. */ extern int get_cgroup_version(char *line); /* Check if given entry from /proc//mountinfo is a cgroupfs v1 mount. */ extern bool is_cgroupfs_v1(char *line); /* Check if given entry from /proc//mountinfo is a cgroupfs v2 mount. */ extern bool is_cgroupfs_v2(char *line); /* Given a v1 hierarchy @mountpoint and base @path, verify that we can create * directories underneath it. */ extern bool test_writeable_v1(char *mountpoint, char *path); /* Given a v2 hierarchy @mountpoint and base @path, verify that we can create * directories underneath it and that we have write access to the cgroup's * "cgroup.procs" file. 
*/ extern bool test_writeable_v2(char *mountpoint, char *path); extern int unified_cgroup_hierarchy(void); extern void *must_realloc(void *orig, size_t sz); extern char *must_make_path(const char *first, ...); extern char *must_copy_string(const char *entry); /* __typeof__ should be safe to use with all compilers. */ typedef __typeof__(((struct statfs *)NULL)->f_type) fs_type_magic; extern bool is_fs_type(const struct statfs *fs, fs_type_magic magic_val); extern char *lxc_string_join(const char *sep, const char **parts, bool use_as_prefix); extern int lxc_count_file_lines(const char *fn); extern bool dir_exists(const char *path); extern int safe_mount(const char *src, const char *dest, const char *fstype, unsigned long flags, const void *data, const char *rootfs); extern FILE *fopen_cloexec(const char *path, const char *mode); extern void append_line(char **dest, size_t oldlen, char *new, size_t newlen); extern char *read_file(const char *fnam); extern char *readat_file(int fd, const char *path); extern char *read_file_strip_newline(const char *fnam); extern char *cg_unified_get_current_cgroup(pid_t pid); extern char *cg_hybrid_get_current_cgroup(char *basecginfo, const char *controller, int type); extern char *cg_legacy_get_current_cgroup(pid_t pid, const char *controller); extern bool mkdir_p(const char *dir, mode_t mode); extern bool is_cgroup_fd(int fd); extern bool is_cgroup2_fd(int fd); static inline int openat_safe(int fd, const char *path) { return openat(fd, path, O_DIRECTORY | O_RDONLY | O_CLOEXEC | O_NOFOLLOW); } extern int cgroup_walkup_to_root(int cgroup2_root_fd, int hierarchy_fd, const char *cgroup, const char *file, char **value); #define must_make_path_relative(__first__, ...) 
\ ({ \ char *__ptr__; \ if (*__first__ == '/') \ __ptr__ = must_make_path(".", __first__, __VA_ARGS__); \ else \ __ptr__ = must_make_path(__first__, __VA_ARGS__); \ __ptr__; \ }) static inline bool is_empty_string(const char *s) { return !s || strcmp(s, "") == 0; } #endif /* __LXC_CGROUP_UTILS_H */ ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/cpuset_parse.c0000664000175000017500000000245514773561567016571 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #include "config.h" #include #include #include #include #include "cgroups/cgroup.h" #include "cgroups/cgroup_utils.h" #include "memory_utils.h" /* * Helper functions for cpuset_in-set */ static char *cpuset_nexttok(const char *c) { char *r; if (!strlen(c)) return NULL; r = strchr(c + 1, ','); return r ? (r + 1) : NULL; } static int cpuset_getrange(const char *c, int *a, int *b) { int ret; ret = sscanf(c, "%d-%d", a, b); return ret; } /* * cpusets are in format "1,2-3,4" * iow, comma-delimited ranges */ bool cpu_in_cpuset(int cpu, const char *cpuset) { if (!strlen(cpuset)) return false; for (const char *c = cpuset; c; c = cpuset_nexttok(c)) { int a, b, ret; ret = cpuset_getrange(c, &a, &b); if (ret == 1 && cpu == a) /* "1" or "1,6" */ return true; else if (ret == 2 && cpu >= a && cpu <= b) /* range match */ return true; } return false; } /* * get cpu number in cpuset */ int cpu_number_in_cpuset(const char *cpuset) { int cpu_number = 0; for (const char *c = cpuset; c; c = cpuset_nexttok(c)) { int a, b, ret; ret = cpuset_getrange(c, &a, &b); if (ret == 1) cpu_number++; else if (ret == 2) cpu_number += a > b ? 
a - b + 1 : b - a + 1; } return cpu_number; } ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/cpuset_parse.h0000664000175000017500000000071514773561567016573 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #ifndef __LXCFS_CPUSET_PARSE_H #define __LXCFS_CPUSET_PARSE_H #include "config.h" #include #include #include #include #include #include #include "macro.h" extern bool cpu_in_cpuset(int cpu, const char *cpuset); extern int cpu_number_in_cpuset(const char *cpuset); extern char *get_cpuset(const char *cg); #endif /* __LXCFS_CPUSET_PARSE_H */ ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/lxcfs.c0000664000175000017500000010752014773561567015212 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #include "config.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "lxcfs_fuse.h" #include "bindings.h" #include "lxcfs_fuse_compat.h" #include "macro.h" #include "memory_utils.h" #include "utils.h" #define PID_FILE "/lxcfs.pid" void *dlopen_handle; static char runtime_path[PATH_MAX] = DEFAULT_RUNTIME_PATH; /* Functions to keep track of number of threads using the library */ static int users_count; static pthread_mutex_t user_count_mutex = PTHREAD_MUTEX_INITIALIZER; static void lock_mutex(pthread_mutex_t *l) { int ret; ret = pthread_mutex_lock(l); if (ret) log_exit("%s - returned: %d\n", strerror(ret), ret); } static void unlock_mutex(pthread_mutex_t *l) { int ret; ret = pthread_mutex_unlock(l); if (ret) log_exit("%s - returned: %d\n", strerror(ret), ret); } static inline void users_lock(void) { lock_mutex(&user_count_mutex); } static inline void users_unlock(void) { unlock_mutex(&user_count_mutex); } /* Returns file info type of custom type declaration carried * in 
fuse_file_info */ static inline enum lxcfs_virt_t file_info_type(struct fuse_file_info *fi) { struct file_info *f; f = INTTYPE_TO_PTR(fi->fh); if (!f) return -1; if (!LXCFS_TYPE_OK(f->type)) return -1; return f->type; } static pthread_t loadavg_pid = 0; /* Returns zero on success */ static int start_loadavg(void) { char *error; pthread_t (*__load_daemon)(int); int (*__load_daemon_v2)(pthread_t *, int); /* try a new load_daemon_v2() API */ dlerror(); __load_daemon_v2 = (int (*)(pthread_t *, int))dlsym(dlopen_handle, "load_daemon_v2"); error = dlerror(); if (error) /* try with an old symbol name */ goto old_api; lxcfs_debug("start_loadavg: using load_daemon_v2"); if (__load_daemon_v2(&loadavg_pid, 1)) { /* we have to NULLify loadavg_pid as in case of error it's contents are undefined */ loadavg_pid = 0; return log_error(-1, "Failed to start loadavg daemon"); } /* we are done */ return 0; old_api: /* go with an old load_daemon() API */ dlerror(); __load_daemon = (pthread_t(*)(int))dlsym(dlopen_handle, "load_daemon"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to start loadavg daemon", error); lxcfs_debug("start_loadavg: using load_daemon"); loadavg_pid = __load_daemon(1); if (!loadavg_pid) return -1; return 0; } /* Returns zero on success */ static int stop_loadavg(void) { char *error; int (*__stop_load_daemon)(pthread_t); __stop_load_daemon = (int (*)(pthread_t))dlsym(dlopen_handle, "stop_load_daemon"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to stop loadavg daemon", error); if (__stop_load_daemon(loadavg_pid)) return -1; return 0; } static volatile sig_atomic_t need_reload; static int do_lxcfs_fuse_init(void) { char *error; void *(*__lxcfs_fuse_init)(struct fuse_conn_info * conn, void * cfg); dlerror(); __lxcfs_fuse_init = (void *(*)(struct fuse_conn_info * conn, void * cfg))dlsym(dlopen_handle, "lxcfs_fuse_init"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find lxcfs_fuse_init()", error); 
__lxcfs_fuse_init(NULL, NULL); return 0; } /* do_reload - reload the dynamic library. Done under * lock and when we know the user_count was 0 */ static void do_reload(bool reinit) { int ret; char lxcfs_lib_path[PATH_MAX]; if (loadavg_pid > 0) stop_loadavg(); if (dlopen_handle) { lxcfs_info("Closed liblxcfs.so"); dlclose(dlopen_handle); } /* First try loading using ld.so */ #ifdef RESOLVE_NOW dlopen_handle = dlopen("liblxcfs.so", RTLD_NOW); #else dlopen_handle = dlopen("liblxcfs.so", RTLD_LAZY); #endif if (dlopen_handle) { lxcfs_debug("Opened liblxcfs.so"); goto good; } #ifdef LIBDIR /* LIBDIR: autoconf will setup this MACRO. Default value is $PREFIX/lib */ ret = snprintf(lxcfs_lib_path, sizeof(lxcfs_lib_path), "%s/lxcfs/liblxcfs.so", LIBDIR); #else ret = snprintf(lxcfs_lib_path, sizeof(lxcfs_lib_path), "/usr/local/lib/lxcfs/liblxcfs.so"); #endif if (ret < 0 || (size_t)ret >= sizeof(lxcfs_lib_path)) log_exit("Failed to create path to open liblxcfs"); dlopen_handle = dlopen(lxcfs_lib_path, RTLD_LAZY); if (!dlopen_handle) log_exit("%s - Failed to open liblxcfs.so at %s", dlerror(), lxcfs_lib_path); else lxcfs_debug("Opened %s", lxcfs_lib_path); good: if (reinit && do_lxcfs_fuse_init() < 0) { log_exit("Failed to initialize liblxcfs.so"); } if (loadavg_pid > 0) start_loadavg(); if (need_reload) lxcfs_info("Reloaded LXCFS"); need_reload = 0; } static void up_users(void) { users_lock(); if (users_count == 0 && need_reload) do_reload(true); users_count++; users_unlock(); } static void down_users(void) { users_lock(); users_count--; users_unlock(); } static void sigusr1_reload(int signo, siginfo_t *info, void *extra) { need_reload = 1; } /* Functions to run the library methods */ static int do_cg_getattr(const char *path, struct stat *sb) { char *error; int (*__cg_getattr)(const char *path, struct stat *sb); dlerror(); __cg_getattr = (int (*)(const char *, struct stat *))dlsym(dlopen_handle, "cg_getattr"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to 
find cg_getattr()", error); return __cg_getattr(path, sb); } static int do_proc_getattr(const char *path, struct stat *sb) { char *error; int (*__proc_getattr)(const char *path, struct stat *sb); dlerror(); __proc_getattr = (int (*)(const char *, struct stat *)) dlsym(dlopen_handle, "proc_getattr"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find proc_getattr()", error); return __proc_getattr(path, sb); } static int do_sys_getattr(const char *path, struct stat *sb) { char *error; int (*__sys_getattr)(const char *path, struct stat *sb); dlerror(); __sys_getattr = (int (*)(const char *, struct stat *)) dlsym(dlopen_handle, "sys_getattr"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find sys_getattr()", error); return __sys_getattr(path, sb); } static int do_cg_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { char *error; int (*__cg_read)(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi); dlerror(); __cg_read = (int (*)(const char *, char *, size_t, off_t, struct fuse_file_info *))dlsym(dlopen_handle, "cg_read"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find cg_read()", error); return __cg_read(path, buf, size, offset, fi); } static int do_proc_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { char *error; int (*__proc_read)(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi); dlerror(); __proc_read = (int (*)(const char *, char *, size_t, off_t, struct fuse_file_info *))dlsym(dlopen_handle, "proc_read"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find proc_read()", error); return __proc_read(path, buf, size, offset, fi); } static int do_sys_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { char *error; int (*__sys_read)(const char *path, char *buf, size_t size, off_t offset, struct 
fuse_file_info *fi); dlerror(); __sys_read = (int (*)(const char *, char *, size_t, off_t, struct fuse_file_info *))dlsym(dlopen_handle, "sys_read"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find sys_read()", error); return __sys_read(path, buf, size, offset, fi); } static int do_cg_write(const char *path, const char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { char *error; int (*__cg_write)(const char *path, const char *buf, size_t size, off_t offset, struct fuse_file_info *fi); dlerror(); __cg_write = (int (*)(const char *, const char *, size_t, off_t, struct fuse_file_info *))dlsym(dlopen_handle, "cg_write"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find cg_write()", error); return __cg_write(path, buf, size, offset, fi); } static int do_sys_write(const char *path, const char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { char *error; int (*__sys_write)(const char *path, const char *buf, size_t size, off_t offset, struct fuse_file_info *fi); dlerror(); __sys_write = (int (*)(const char *, const char *, size_t, off_t, struct fuse_file_info *))dlsym(dlopen_handle, "sys_write"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find sys_write()", error); return __sys_write(path, buf, size, offset, fi); } static int do_cg_mkdir(const char *path, mode_t mode) { char *error; int (*__cg_mkdir)(const char *path, mode_t mode); dlerror(); __cg_mkdir = (int (*)(const char *, mode_t))dlsym(dlopen_handle, "cg_mkdir"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find cg_mkdir()", error); return __cg_mkdir(path, mode); } static int do_cg_chown(const char *path, uid_t uid, gid_t gid) { char *error; int (*__cg_chown)(const char *path, uid_t uid, gid_t gid); dlerror(); __cg_chown = (int (*)(const char *, uid_t, gid_t))dlsym(dlopen_handle, "cg_chown"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find cg_chown()", error); return 
__cg_chown(path, uid, gid); } static int do_cg_rmdir(const char *path) { char *error; int (*__cg_rmdir)(const char *path); dlerror(); __cg_rmdir = (int (*)(const char *path))dlsym(dlopen_handle, "cg_rmdir"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find cg_rmdir()", error); return __cg_rmdir(path); } static int do_cg_chmod(const char *path, mode_t mode) { char *error; int (*__cg_chmod)(const char *path, mode_t mode); dlerror(); __cg_chmod = (int (*)(const char *, mode_t))dlsym(dlopen_handle, "cg_chmod"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find cg_chmod()", error); return __cg_chmod(path, mode); } static int do_cg_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi) { char *error; int (*__cg_readdir)(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi); dlerror(); __cg_readdir = (int (*)(const char *, void *, fuse_fill_dir_t, off_t, struct fuse_file_info *))dlsym(dlopen_handle, "cg_readdir"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find cg_readdir()", error); return __cg_readdir(path, buf, filler, offset, fi); } static int do_proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi) { char *error; int (*__proc_readdir)(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi); dlerror(); __proc_readdir = (int (*)(const char *, void *, fuse_fill_dir_t, off_t, struct fuse_file_info *))dlsym(dlopen_handle, "proc_readdir"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find proc_readdir()", error); return __proc_readdir(path, buf, filler, offset, fi); } static int do_sys_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi) { char *error; int (*__sys_readdir)(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info 
*fi); dlerror(); __sys_readdir = (int (*)(const char *, void *, fuse_fill_dir_t, off_t, struct fuse_file_info *))dlsym(dlopen_handle, "sys_readdir"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find sys_readdir()", error); return __sys_readdir(path, buf, filler, offset, fi); } static int do_sys_readlink(const char *path, char *buf, size_t size) { char *error; int (*__sys_readlink)(const char *path, char *buf, size_t size); dlerror(); __sys_readlink = (int (*)(const char *, char *, size_t))dlsym(dlopen_handle, "sys_readlink"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find sys_readlink()", error); return __sys_readlink(path, buf, size); } static int do_cg_open(const char *path, struct fuse_file_info *fi) { char *error; int (*__cg_open)(const char *path, struct fuse_file_info *fi); dlerror(); __cg_open = (int (*)(const char *, struct fuse_file_info *))dlsym(dlopen_handle, "cg_open"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find cg_open()", error); return __cg_open(path, fi); } static int do_cg_access(const char *path, int mode) { char *error; int (*__cg_access)(const char *path, int mode); dlerror(); __cg_access = (int (*)(const char *, int mode))dlsym(dlopen_handle, "cg_access"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find cg_access()", error); return __cg_access(path, mode); } static int do_proc_open(const char *path, struct fuse_file_info *fi) { char *error; int (*__proc_open)(const char *path, struct fuse_file_info *fi); dlerror(); __proc_open = (int (*)(const char *path, struct fuse_file_info *fi))dlsym(dlopen_handle, "proc_open"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find proc_open()", error); return __proc_open(path, fi); } static int do_proc_access(const char *path, int mode) { char *error; int (*__proc_access)(const char *path, int mode); dlerror(); __proc_access = (int (*)(const char *, int mode))dlsym(dlopen_handle, 
"proc_access"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find proc_access()", error); return __proc_access(path, mode); } static int do_sys_open(const char *path, struct fuse_file_info *fi) { char *error; int (*__sys_open)(const char *path, struct fuse_file_info *fi); dlerror(); __sys_open = (int (*)(const char *path, struct fuse_file_info *fi))dlsym(dlopen_handle, "sys_open"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find sys_open()", error); return __sys_open(path, fi); } static int do_sys_opendir(const char *path, struct fuse_file_info *fi) { char *error; int (*__sys_opendir)(const char *path, struct fuse_file_info *fi); dlerror(); __sys_opendir = (int (*)(const char *path, struct fuse_file_info *fi))dlsym(dlopen_handle, "sys_opendir"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find sys_opendir()", error); return __sys_opendir(path, fi); } static int do_sys_access(const char *path, int mode) { char *error; int (*__sys_access)(const char *path, int mode); dlerror(); __sys_access = (int (*)(const char *, int mode))dlsym(dlopen_handle, "sys_access"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find sys_access()", error); return __sys_access(path, mode); } static int do_cg_release(const char *path, struct fuse_file_info *fi) { char *error; int (*__cg_release)(const char *path, struct fuse_file_info *fi); dlerror(); __cg_release = (int (*)(const char *path, struct fuse_file_info *))dlsym(dlopen_handle, "cg_release"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find cg_release()", error); return __cg_release(path, fi); } static int do_proc_release(const char *path, struct fuse_file_info *fi) { char *error; int (*__proc_release)(const char *path, struct fuse_file_info *fi); dlerror(); __proc_release = (int (*)(const char *path, struct fuse_file_info *)) dlsym(dlopen_handle, "proc_release"); error = dlerror(); if (error) return log_error(-1, "%s - 
Failed to find proc_release()", error); return __proc_release(path, fi); } static int do_sys_release(const char *path, struct fuse_file_info *fi) { char *error; int (*__sys_release)(const char *path, struct fuse_file_info *fi); dlerror(); __sys_release = (int (*)(const char *path, struct fuse_file_info *))dlsym(dlopen_handle, "sys_release"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find sys_release()", error); return __sys_release(path, fi); } static int do_cg_opendir(const char *path, struct fuse_file_info *fi) { char *error; int (*__cg_opendir)(const char *path, struct fuse_file_info *fi); dlerror(); __cg_opendir = (int (*)(const char *path, struct fuse_file_info *fi))dlsym(dlopen_handle, "cg_opendir"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find cg_opendir()", error); return __cg_opendir(path, fi); } static int do_cg_releasedir(const char *path, struct fuse_file_info *fi) { char *error; int (*__cg_releasedir)(const char *path, struct fuse_file_info *fi); dlerror(); __cg_releasedir = (int (*)(const char *path, struct fuse_file_info *))dlsym(dlopen_handle, "cg_releasedir"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find cg_releasedir()", error); return __cg_releasedir(path, fi); } static int do_sys_releasedir(const char *path, struct fuse_file_info *fi) { char *error; int (*__sys_releasedir)(const char *path, struct fuse_file_info *fi); dlerror(); __sys_releasedir = (int (*)(const char *path, struct fuse_file_info *))dlsym(dlopen_handle, "sys_releasedir"); error = dlerror(); if (error) return log_error(-1, "%s - Failed to find sys_releasedir()", error); return __sys_releasedir(path, fi); } static bool cgroup_is_enabled = false; #if HAVE_FUSE3 static int lxcfs_getattr(const char *path, struct stat *sb, struct fuse_file_info *fi) #else static int lxcfs_getattr(const char *path, struct stat *sb) #endif { int ret; struct timespec now; if (strcmp(path, "/") == 0) { if 
(clock_gettime(CLOCK_REALTIME, &now) < 0) return -EINVAL; sb->st_uid = sb->st_gid = 0; sb->st_atim = sb->st_mtim = sb->st_ctim = now; sb->st_size = 0; sb->st_mode = S_IFDIR | 00755; sb->st_nlink = 2; return 0; } if (cgroup_is_enabled && strncmp(path, "/cgroup", 7) == 0) { up_users(); ret = do_cg_getattr(path, sb); down_users(); return ret; } if (strncmp(path, "/proc", 5) == 0) { up_users(); ret = do_proc_getattr(path, sb); down_users(); return ret; } if (strncmp(path, "/sys", 4) == 0) { up_users(); ret = do_sys_getattr(path, sb); down_users(); return ret; } return -ENOENT; } static int lxcfs_opendir(const char *path, struct fuse_file_info *fi) { int ret; if (strcmp(path, "/") == 0) return 0; if (cgroup_is_enabled && strncmp(path, "/cgroup", 7) == 0) { up_users(); ret = do_cg_opendir(path, fi); down_users(); return ret; } if (strcmp(path, "/proc") == 0) return 0; if (strncmp(path, "/sys", 4) == 0) { up_users(); ret = do_sys_opendir(path, fi); down_users(); return ret; } return -ENOENT; } #if HAVE_FUSE3 static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi, enum fuse_readdir_flags flags) #else static int lxcfs_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi) #endif { int ret; enum lxcfs_virt_t type; type = file_info_type(fi); if (strcmp(path, "/") == 0) { if (dir_filler(filler, buf, ".", 0) != 0 || dir_filler(filler, buf, "..", 0) != 0 || dir_filler(filler, buf, "proc", 0) != 0 || dir_filler(filler, buf, "sys", 0) != 0 || (cgroup_is_enabled && dir_filler(filler, buf, "cgroup", 0) != 0)) return -ENOMEM; return 0; } if (cgroup_is_enabled && LXCFS_TYPE_CGROUP(type)) { up_users(); ret = do_cg_readdir(path, buf, filler, offset, fi); down_users(); return ret; } if (strcmp(path, "/proc") == 0) { up_users(); ret = do_proc_readdir(path, buf, filler, offset, fi); down_users(); return ret; } if (LXCFS_TYPE_SYS(type)) { up_users(); ret = do_sys_readdir(path, buf, 
filler, offset, fi); down_users(); return ret; } return -ENOENT; } static int lxcfs_access(const char *path, int mode) { int ret; if (strcmp(path, "/") == 0 && (mode & W_OK) == 0) return 0; if (cgroup_is_enabled && strncmp(path, "/cgroup", 7) == 0) { up_users(); ret = do_cg_access(path, mode); down_users(); return ret; } if (strncmp(path, "/proc", 5) == 0) { up_users(); ret = do_proc_access(path, mode); down_users(); return ret; } if (strncmp(path, "/sys", 4) == 0) { up_users(); ret = do_sys_access(path, mode); down_users(); return ret; } return -EACCES; } static int lxcfs_releasedir(const char *path, struct fuse_file_info *fi) { int ret; enum lxcfs_virt_t type; type = file_info_type(fi); if (LXCFS_TYPE_CGROUP(type)) { up_users(); ret = do_cg_releasedir(path, fi); down_users(); return ret; } if (LXCFS_TYPE_SYS(type)) { up_users(); ret = do_sys_releasedir(path, fi); down_users(); return ret; } if (path) { if (strcmp(path, "/") == 0) return 0; if (strcmp(path, "/proc") == 0) return 0; } lxcfs_error("unknown file type: path=%s, type=%d, fi->fh=%" PRIu64, path, type, fi->fh); return -EINVAL; } static int lxcfs_open(const char *path, struct fuse_file_info *fi) { int ret; if (cgroup_is_enabled && strncmp(path, "/cgroup", 7) == 0) { up_users(); ret = do_cg_open(path, fi); down_users(); return ret; } if (strncmp(path, "/proc", 5) == 0) { up_users(); ret = do_proc_open(path, fi); down_users(); return ret; } if (strncmp(path, "/sys", 4) == 0) { up_users(); ret = do_sys_open(path, fi); down_users(); return ret; } return -EACCES; } static int lxcfs_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { int ret; enum lxcfs_virt_t type; type = file_info_type(fi); if (cgroup_is_enabled && LXCFS_TYPE_CGROUP(type)) { up_users(); ret = do_cg_read(path, buf, size, offset, fi); down_users(); return ret; } if (LXCFS_TYPE_PROC(type)) { up_users(); ret = do_proc_read(path, buf, size, offset, fi); down_users(); return ret; } if (LXCFS_TYPE_SYS(type)) { 
up_users(); ret = do_sys_read(path, buf, size, offset, fi); down_users(); return ret; } lxcfs_error("unknown file type: path=%s, type=%d, fi->fh=%" PRIu64, path, type, fi->fh); return -EINVAL; } int lxcfs_write(const char *path, const char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { int ret; enum lxcfs_virt_t type; type = file_info_type(fi); if (cgroup_is_enabled && LXCFS_TYPE_CGROUP(type)) { up_users(); ret = do_cg_write(path, buf, size, offset, fi); down_users(); return ret; } if (LXCFS_TYPE_SYS(type)) { up_users(); ret = do_sys_write(path, buf, size, offset, fi); down_users(); return ret; } return -EINVAL; } int lxcfs_readlink(const char *path, char *buf, size_t size) { int ret; if (strncmp(path, "/sys", 4) == 0) { up_users(); ret = do_sys_readlink(path, buf, size); down_users(); return ret; } return -EINVAL; } static int lxcfs_flush(const char *path, struct fuse_file_info *fi) { return 0; } static int lxcfs_release(const char *path, struct fuse_file_info *fi) { int ret; enum lxcfs_virt_t type; type = file_info_type(fi); if (LXCFS_TYPE_CGROUP(type)) { up_users(); ret = do_cg_release(path, fi); down_users(); return ret; } if (LXCFS_TYPE_PROC(type)) { up_users(); ret = do_proc_release(path, fi); down_users(); return ret; } if (LXCFS_TYPE_SYS(type)) { up_users(); ret = do_sys_release(path, fi); down_users(); return ret; } lxcfs_error("unknown file type: path=%s, type=%d, fi->fh=%" PRIu64, path, type, fi->fh); return -EINVAL; } static int lxcfs_fsync(const char *path, int datasync, struct fuse_file_info *fi) { return 0; } int lxcfs_mkdir(const char *path, mode_t mode) { int ret; if (cgroup_is_enabled && strncmp(path, "/cgroup", 7) == 0) { up_users(); ret = do_cg_mkdir(path, mode); down_users(); return ret; } return -EPERM; } #if HAVE_FUSE3 int lxcfs_chown(const char *path, uid_t uid, gid_t gid, struct fuse_file_info *fi) #else int lxcfs_chown(const char *path, uid_t uid, gid_t gid) #endif { int ret; if (cgroup_is_enabled && strncmp(path, "/cgroup", 
7) == 0) { up_users(); ret = do_cg_chown(path, uid, gid); down_users(); return ret; } if (strncmp(path, "/proc", 5) == 0) return -EPERM; if (strncmp(path, "/sys", 4) == 0) return -EPERM; return -ENOENT; } /* * cat first does a truncate before doing ops->write. This doesn't * really make sense for cgroups. So just return 0 always but do * nothing. */ #if HAVE_FUSE3 int lxcfs_truncate(const char *path, off_t newsize, struct fuse_file_info *fi) #else int lxcfs_truncate(const char *path, off_t newsize) #endif { if (cgroup_is_enabled && strncmp(path, "/cgroup", 7) == 0) return 0; if (strncmp(path, "/sys", 4) == 0) return 0; return -EPERM; } int lxcfs_rmdir(const char *path) { int ret; if (cgroup_is_enabled && strncmp(path, "/cgroup", 7) == 0) { up_users(); ret = do_cg_rmdir(path); down_users(); return ret; } return -EPERM; } #if HAVE_FUSE3 int lxcfs_chmod(const char *path, mode_t mode, struct fuse_file_info *fi) #else int lxcfs_chmod(const char *path, mode_t mode) #endif { int ret; if (cgroup_is_enabled && strncmp(path, "/cgroup", 7) == 0) { up_users(); ret = do_cg_chmod(path, mode); down_users(); return ret; } if (strncmp(path, "/proc", 5) == 0) return -EPERM; if (strncmp(path, "/sys", 4) == 0) return -EPERM; return -ENOENT; } #if HAVE_FUSE3 static void fuse_intr_sighandler(int sig) { (void) sig; /* Nothing to do */ } static int fuse_init_intr_signal(int signum) { struct sigaction old_sa; struct sigaction sa; if (sigaction(signum, NULL, &old_sa) == -1) return log_error(-1, "cannot get old signal handler\n"); if (old_sa.sa_handler != SIG_DFL) return log_error(-1, "%d has non-default handler\n", signum); memset(&sa, 0, sizeof(struct sigaction)); /* * We *must* enable SA_RESTART, otherwise we may accidentally * break some code which is not ready to signals/fuse interrupt. 
*/ sa.sa_flags = SA_RESTART; sa.sa_handler = fuse_intr_sighandler; sigemptyset(&sa.sa_mask); if (sigaction(signum, &sa, NULL) == -1) return log_error(-1, "cannot set interrupt signal handler\n"); return 0; } #endif #if HAVE_FUSE3 static void *lxcfs_init(struct fuse_conn_info *conn, struct fuse_config *cfg) #else static void *lxcfs_init(struct fuse_conn_info *conn) #endif { if (do_lxcfs_fuse_init() < 0) return NULL; #if HAVE_FUSE3 cfg->direct_io = 1; cfg->intr = 1; cfg->intr_signal = LXCFS_INTR_SIGNAL; #endif return fuse_get_context()->private_data; } const struct fuse_operations lxcfs_ops = { .access = lxcfs_access, .chmod = lxcfs_chmod, .chown = lxcfs_chown, .flush = lxcfs_flush, .fsync = lxcfs_fsync, .getattr = lxcfs_getattr, .init = lxcfs_init, .mkdir = lxcfs_mkdir, .open = lxcfs_open, .opendir = lxcfs_opendir, .read = lxcfs_read, .readdir = lxcfs_readdir, .release = lxcfs_release, .releasedir = lxcfs_releasedir, .rmdir = lxcfs_rmdir, .truncate = lxcfs_truncate, .write = lxcfs_write, .readlink = lxcfs_readlink, .create = NULL, .destroy = NULL, #if !HAVE_FUSE3 .fgetattr = NULL, #endif .fsyncdir = NULL, #if !HAVE_FUSE3 .ftruncate = NULL, .getdir = NULL, #endif .getxattr = NULL, .link = NULL, .listxattr = NULL, .mknod = NULL, .rename = NULL, .removexattr = NULL, .setxattr = NULL, .statfs = NULL, .symlink = NULL, .unlink = NULL, #if !HAVE_FUSE3 .utime = NULL, #endif }; static void usage(void) { lxcfs_info("Usage: lxcfs \n"); lxcfs_info("lxcfs is a FUSE-based proc, sys and cgroup virtualizing filesystem\n"); lxcfs_info("Options :"); lxcfs_info(" -d, --debug Run lxcfs with debugging enabled"); lxcfs_info(" -f, --foreground Run lxcfs in the foreground"); lxcfs_info(" -h, --help Print help"); lxcfs_info(" -l, --enable-loadavg Enable loadavg virtualization"); lxcfs_info(" -o Options to pass directly through fuse"); lxcfs_info(" -p, --pidfile=FILE Path to use for storing lxcfs pid"); lxcfs_info(" Default pidfile is %s/lxcfs.pid", DEFAULT_RUNTIME_PATH); lxcfs_info(" -u, 
--disable-swap Disable swap virtualization"); lxcfs_info(" -v, --version Print lxcfs version"); lxcfs_info(" --enable-cfs Enable CPU virtualization via CPU shares"); lxcfs_info(" --enable-pidfd Use pidfd for process tracking"); lxcfs_info(" --enable-cgroup Enable cgroup emulation code"); lxcfs_info(" --runtime-dir=DIR Path to use as the runtime directory."); lxcfs_info(" Default is %s", DEFAULT_RUNTIME_PATH); exit(EXIT_FAILURE); } static int set_pidfile(char *pidfile) { __do_close int fd = -EBADF; char buf[INTTYPE_TO_STRLEN(long)]; int ret; struct flock fl = { .l_type = F_WRLCK, .l_whence = SEEK_SET, .l_start = 0, .l_len = 0, }; fd = open(pidfile, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | O_CLOEXEC); if (fd < 0) return log_error(-1, "Could not open pidfile %s: %m", pidfile); if (fcntl(fd, F_SETLK, &fl) < 0) { if (errno == EAGAIN || errno == EACCES) return log_error(-1, "PID file '%s' is already locked", pidfile); lxcfs_error("Warning; unable to lock PID file, proceeding"); } if (ftruncate(fd, 0)) return log_error(-1, "Error truncating PID file '%s': %m", pidfile); ret = snprintf(buf, sizeof(buf), "%ld\n", (long)getpid()); if (ret < 0 || (size_t)ret >= sizeof(buf)) return log_error(-1, "Failed to convert pid to string %m"); if (write(fd, buf, ret) != ret) return log_error(-1, "Error writing to PID file '%s': %m", pidfile); return move_fd(fd); } static const struct option long_options[] = { {"debug", no_argument, 0, 'd' }, {"disable-swap", no_argument, 0, 'u' }, {"enable-loadavg", no_argument, 0, 'l' }, {"foreground", no_argument, 0, 'f' }, {"help", no_argument, 0, 'h' }, {"version", no_argument, 0, 'v' }, {"enable-cfs", no_argument, 0, 0 }, {"enable-pidfd", no_argument, 0, 0 }, {"enable-cgroup", no_argument, 0, 0 }, {"pidfile", required_argument, 0, 'p' }, {"runtime-dir", required_argument, 0, 0 }, { }, }; static int append_comma_separate(char **s, const char *append) { int ret; char *news; size_t append_len, len; if (!append) return 0; append_len = strlen(append); if 
(!append_len) return 0; if (*s) { len = strlen(*s); news = realloc(*s, len + append_len + 2); } else { len = 0; news = realloc(NULL, append_len + 1); } if (!news) return -ENOMEM; if (*s) ret = snprintf(news + len, append_len + 2, ",%s", append); else ret = snprintf(news, append_len + 1, "%s", append); if (ret < 0) return -EIO; *s = news; return 0; } int main(int argc, char *argv[]) { int pidfile_fd = -EBADF; int ret = EXIT_FAILURE; char *pidfile = NULL, *token = NULL; char pidfile_buf[PATH_MAX + sizeof(PID_FILE)] = {}; bool debug = false, foreground = false; #if !HAVE_FUSE3 bool nonempty = false; #endif bool load_use = false; /* * what we pass to fuse_main is: * argv[0] -s [-f|-d] -o allow_other,directio argv[1] NULL */ int fuse_argc = 0; int c, idx, new_argc; char *fuse_argv[7]; const char *fuse_opts = NULL; char *new_fuse_opts = NULL; char *const *new_argv; struct lxcfs_opts *opts; char *runtime_path_arg = NULL; opts = malloc(sizeof(struct lxcfs_opts)); if (opts == NULL) { lxcfs_error("Error allocating memory for options"); goto out; } opts->swap_off = false; opts->use_pidfd = false; opts->use_cfs = false; opts->version = 2; while ((c = getopt_long(argc, argv, "dulfhvso:p:", long_options, &idx)) != -1) { switch (c) { case 0: if (strcmp(long_options[idx].name, "enable-pidfd") == 0) opts->use_pidfd = true; else if (strcmp(long_options[idx].name, "enable-cfs") == 0) opts->use_cfs = true; else if (strcmp(long_options[idx].name, "enable-cgroup") == 0) cgroup_is_enabled = true; else if (strcmp(long_options[idx].name, "runtime-dir") == 0) runtime_path_arg = optarg; else usage(); break; case 'd': debug = true; break; case 'f': foreground = true; break; case 'l': load_use = true; break; case 'o': if (fuse_opts) { lxcfs_error("Specifying -o multiple times is unsupported"); usage(); } fuse_opts = optarg; break; case 'p': pidfile = optarg; break; case 's': /* legacy argument: ignore */ break; case 'u': opts->swap_off = true; break; case 'v': lxcfs_info("%s", 
STRINGIFY(PROJECT_VERSION)); exit(EXIT_SUCCESS); default: usage(); } } if (foreground && debug) log_exit("Both --debug and --forgreound specified"); new_argv = &argv[optind]; new_argc = argc - optind; /* Older LXCFS versions printed help when used without any argument. */ if (new_argc == 0) usage(); if (new_argc != 1) { lxcfs_error("Missing mountpoint"); goto out; } if (runtime_path_arg) { strlcpy(runtime_path, runtime_path_arg, sizeof(runtime_path)); lxcfs_info("runtime path set to %s", runtime_path); } strlcpy(opts->runtime_path, runtime_path, sizeof(opts->runtime_path)); fuse_argv[fuse_argc++] = argv[0]; if (debug) fuse_argv[fuse_argc++] = "-d"; else fuse_argv[fuse_argc++] = "-f"; fuse_argv[fuse_argc++] = "-o"; /* Parse additional fuse options. */ if (fuse_opts) { char *dup; dup = strdup(fuse_opts); if (!dup) { lxcfs_error("Failed to copy fuse options"); goto out; } lxc_iterate_parts(token, dup, ",") { /* default */ if (strcmp(token, "allow_other") == 0) continue; /* default for LXCFS */ if (strcmp(token, "direct_io") == 0) continue; /* default for LXCFS */ if (strncmp(token, "entry_timeout", STRLITERALLEN("entry_timeout")) == 0) continue; /* default for LXCFS */ if (strncmp(token, "attr_timeout", STRLITERALLEN("entry_timeout")) == 0) continue; /* default for LXCFS */ if (strncmp(token, "allow_other", STRLITERALLEN("allow_other")) == 0) continue; /* default with fuse3 */ if (strcmp(token, "nonempty") == 0) { #if !HAVE_FUSE3 nonempty = true; #endif continue; } if (append_comma_separate(&new_fuse_opts, token)) { lxcfs_error("Failed to copy fuse argument \"%s\"", token); free(dup); goto out; } } free(dup); } if (append_comma_separate(&new_fuse_opts, "allow_other,entry_timeout=0.5,attr_timeout=0.5")) { lxcfs_error("Failed to copy fuse argument \"allow_other,entry_timeout=0.5,attr_timeout=0.5\""); goto out; } #if !HAVE_FUSE3 if (nonempty) { if (append_comma_separate(&new_fuse_opts, "nonempty")) { lxcfs_error("Failed to copy fuse argument \"nonempty\""); goto out; } } 
if (append_comma_separate(&new_fuse_opts, "direct_io")) { lxcfs_error("Failed to copy fuse argument \"direct_io\""); goto out; } #endif /* * We can't use default_permissions since we still support systems that * don't have kernels with cgroup namespace support. On such kernels * lxcfs will provide a namespaced cgroup view and needs explicit * access helpers to make that work. * Another reason that came to me is that we can't or at least * shouldn't guarantee that we don't need more complicated access * helpers for proc and sys virtualization in the future. */ fuse_argv[fuse_argc++] = new_fuse_opts; fuse_argv[fuse_argc++] = new_argv[0]; fuse_argv[fuse_argc] = NULL; lxcfs_info("Starting LXCFS at %s", argv[0]); do_reload(false); if (install_signal_handler(SIGUSR1, sigusr1_reload)) { lxcfs_error("%s - Failed to install SIGUSR1 signal handler", strerror(errno)); goto out; } #if HAVE_FUSE3 if (fuse_init_intr_signal(LXCFS_INTR_SIGNAL)) { lxcfs_error("Failed to install fuse interrupt signal handler"); goto out; } #endif if (!pidfile) { snprintf(pidfile_buf, sizeof(pidfile_buf), "%s%s", runtime_path, PID_FILE); pidfile = pidfile_buf; } pidfile_fd = set_pidfile(pidfile); if (pidfile_fd < 0) goto out; if (load_use && start_loadavg() != 0) goto out; if (!fuse_main(fuse_argc, fuse_argv, &lxcfs_ops, opts)) ret = EXIT_SUCCESS; if (load_use) stop_loadavg(); out: if (dlopen_handle) dlclose(dlopen_handle); if (pidfile) unlink(pidfile); free(new_fuse_opts); free(opts); close_prot_errno_disarm(pidfile_fd); exit(ret); } ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/lxcfs_fuse.h0000664000175000017500000000036114773561567016234 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #ifndef __LXCFS_FUSE_H #define __LXCFS_FUSE_H #include "config.h" #if HAVE_FUSE3 #include #else #include #endif #include "lxcfs_fuse_compat.h" #endif /* __LXCFS_FUSE_H */ ././@PaxHeader0000000000000000000000000000002600000000000010213 
xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/lxcfs_fuse_compat.h0000664000175000017500000000310714773561567017600 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #ifndef __LXCFS_FUSE_COMPAT_H #define __LXCFS_FUSE_COMPAT_H #include "config.h" #include #include #include #include "memory_utils.h" #ifdef HAVE_FUSE3 #define DIR_FILLER(F,B,N,S,O) F(B,N,S,O,FUSE_FILL_DIR_PLUS) #else #define DIR_FILLER(F,B,N,S,O) F(B,N,S,O) #endif static inline int dir_filler(fuse_fill_dir_t filler, void *buf, const char *name, off_t off) { return DIR_FILLER(filler, buf, name, NULL, off); } static inline int dirent_filler(fuse_fill_dir_t filler, const char *path, const char *name, void *buf, off_t off) { __do_closedir DIR *dirp = NULL; struct stat st; dirp = opendir(path); if (dirp && !fstatat(dirfd(dirp), name, &st, AT_SYMLINK_NOFOLLOW)) return DIR_FILLER(filler, buf, name, &st, off); return DIR_FILLER(filler, buf, name, NULL, off); } static inline int dirent_fillerat(fuse_fill_dir_t filler, DIR *dp, struct dirent *dentry, void *buf, off_t off) { struct stat st; int ret; ret = fstatat(dirfd(dp), dentry->d_name, &st, AT_SYMLINK_NOFOLLOW); if (ret) { st = (struct stat){ .st_ino = dentry->d_ino, .st_mode = dentry->d_type << 12, }; } return DIR_FILLER(filler, buf, dentry->d_name, &st, off); } static inline int dir_fillerat(fuse_fill_dir_t filler, DIR *dp, const char *name, void *buf, off_t off) { struct stat st; int ret; ret = fstatat(dirfd(dp), name, &st, AT_SYMLINK_NOFOLLOW); if (!ret) return DIR_FILLER(filler, buf, name, &st, off); return DIR_FILLER(filler, buf, name, NULL, off); } #endif /* __LXCFS_FUSE_COMPAT_H */ ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/macro.h0000664000175000017500000001217114773561567015176 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #ifndef __LXCFS_MACRO_H #define __LXCFS_MACRO_H #include "config.h" #include #include #include #define BATCH_SIZE 50 /* filesystem 
/*
 * lxcfs_v() - verbose tracing, compiled in only when VERBOSE is defined.
 *
 * With VERBOSE the message is routed through lxcfs_error(); without it the
 * call compiles to an empty statement and its arguments are not evaluated.
 *
 * Both variants expand to exactly one statement WITHOUT a trailing
 * semicolon. The caller supplies the semicolon, which keeps the macro safe
 * inside unbraced if/else bodies (a semicolon baked into the macro would
 * terminate the "if" early and orphan the "else").
 */
#ifdef VERBOSE
#define lxcfs_v(format, ...) lxcfs_error(format, ##__VA_ARGS__)
#else
#define lxcfs_v(format, ...) \
	do {                 \
	} while (false)
#endif /* VERBOSE */
/*
 * lxc_iterate_parts() - loop over the tokens of a delimited string.
 *
 * @__iterator:   an already-declared lvalue (assigned, not declared, by the
 *                macro) that receives each token in turn; it is NULL once the
 *                loop has run to completion.
 * @__splitme:    the string to tokenize. It is handed straight to strtok_r(),
 *                which overwrites separators with NUL bytes, so pass a
 *                writable copy when the original must stay intact.
 * @__separators: set of separator characters, as understood by strtok_r().
 *
 * The strtok_r() save pointer (__p) and the current token (__it) are declared
 * in the for-init clause, so they are private to each loop instance.
 */
#define lxc_iterate_parts(__iterator, __splitme, __separators)                  \
	for (char *__p = NULL, *__it = strtok_r(__splitme, __separators, &__p); \
	     (__iterator = __it);                                               \
	     __iterator = __it = strtok_r(NULL, __separators, &__p))
*/ #define NBITS 32 /* bits in uint32_t */ #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d)) #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, NBITS) static inline void set_bit(__u32 bit, __u32 *bitarr) { bitarr[bit / NBITS] |= ((__u32)1 << (bit % NBITS)); } static inline void clear_bit(__u32 bit, __u32 *bitarr) { bitarr[bit / NBITS] &= ~((__u32)1 << (bit % NBITS)); } static inline bool is_set(__u32 bit, __u32 *bitarr) { return (bitarr[bit / NBITS] & ((__u32)1 << (bit % NBITS))) != 0; } #endif /* __LXCFS_MACRO_H */ ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/memory_utils.h0000664000175000017500000000401614773561567016624 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #ifndef __LXCFS_MEMORY_UTILS_H #define __LXCFS_MEMORY_UTILS_H #include "config.h" #include #include #include #include #include #include #include #include "macro.h" #define define_cleanup_function(type, cleaner) \ static inline void cleaner##_function(type *ptr) \ { \ if (*ptr) \ cleaner(*ptr); \ } #define call_cleaner(cleaner) __attribute__((__cleanup__(cleaner##_function))) #define close_prot_errno_disarm(fd) \ if (fd >= 0) { \ int _e_ = errno; \ close(fd); \ errno = _e_; \ fd = -EBADF; \ } #define close_prot_errno_replace(fd, new_fd) \ if (fd >= 0) { \ int _e_ = errno; \ close(fd); \ errno = _e_; \ fd = new_fd; \ } static inline void close_prot_errno_disarm_function(int *fd) { close_prot_errno_disarm(*fd); } #define __do_close call_cleaner(close_prot_errno_disarm) define_cleanup_function(FILE *, fclose); #define __do_fclose call_cleaner(fclose) define_cleanup_function(DIR *, closedir); #define __do_closedir call_cleaner(closedir) #define free_disarm(ptr) \ ({ \ free(ptr); \ ptr = NULL; \ }) static inline void free_disarm_function(void *ptr) { free_disarm(*(void **)ptr); } #define __do_free call_cleaner(free_disarm) static inline void free_string_list(char **list) { if (list) { for (int i = 0; list[i]; i++) free(list[i]); 
/*
 * Duplicate a block of memory.
 *
 * @data: start of the bytes to copy.
 * @len:  number of bytes to copy.
 *
 * Returns a freshly malloc()ed copy that the caller must free(), or NULL when
 * @len is zero or the allocation fails.
 */
static inline void *memdup(const void *data, size_t len)
{
	void *clone;

	/* A zero-length request never allocates. */
	if (len == 0)
		return NULL;

	clone = malloc(len);
	if (!clone)
		return NULL;

	memcpy(clone, data, len);
	return clone;
}
*/ pthread_rwlock_t lock; }; #define CPUVIEW_HASH_SIZE 100 static struct cg_proc_stat_head *proc_stat_history[CPUVIEW_HASH_SIZE]; static void reset_proc_stat_node(struct cg_proc_stat *node, struct cpuacct_usage *usage, int cpu_count) { lxcfs_debug("Resetting stat node for %s\n", node->cg); memcpy(node->usage, usage, sizeof(struct cpuacct_usage) * cpu_count); for (int i = 0; i < cpu_count; i++) { node->view[i].user = 0; node->view[i].system = 0; node->view[i].idle = 0; } node->cpu_count = cpu_count; } static bool expand_proc_stat_node(struct cg_proc_stat *node, int cpu_count) { __do_free struct cpuacct_usage *new_usage = NULL, *new_view = NULL; /* Allocate new memory */ new_usage = zalloc(sizeof(struct cpuacct_usage) * cpu_count); if (!new_usage) return false; new_view = zalloc(sizeof(struct cpuacct_usage) * cpu_count); if (!new_view) return false; /* Copy existing data & initialize new elements */ for (int i = 0; i < cpu_count; i++) { if (i < node->cpu_count) { new_usage[i].user = node->usage[i].user; new_usage[i].system = node->usage[i].system; new_usage[i].idle = node->usage[i].idle; new_view[i].user = node->view[i].user; new_view[i].system = node->view[i].system; new_view[i].idle = node->view[i].idle; } } free(node->usage); node->usage = move_ptr(new_usage); free(node->view); node->view = move_ptr(new_view); node->cpu_count = cpu_count; return true; } static void free_proc_stat_node(struct cg_proc_stat *node) { if (node) { /* * We're abusing the usage pointer to indicate that * pthread_mutex_init() was successful. Don't judge me. 
*/ if (node->usage) pthread_mutex_destroy(&node->lock); free_disarm(node->cg); free_disarm(node->usage); free_disarm(node->view); free_disarm(node); } } define_cleanup_function(struct cg_proc_stat *, free_proc_stat_node); static struct cg_proc_stat *add_proc_stat_node(struct cg_proc_stat *new_node) { call_cleaner(free_proc_stat_node) struct cg_proc_stat *new = new_node; struct cg_proc_stat *rv = new_node; int hash = calc_hash(new->cg) % CPUVIEW_HASH_SIZE; struct cg_proc_stat_head *head = proc_stat_history[hash]; struct cg_proc_stat *cur; pthread_rwlock_wrlock(&head->lock); if (!head->next) { head->next = move_ptr(new); goto out_rwlock_unlock; } cur = head->next; for (;;) { /* * The node to be added is already present in the list, so * free the newly allocated one and return the one we found. */ if (strcmp(cur->cg, new->cg) == 0) { rv = cur; goto out_rwlock_unlock; } /* Keep walking. */ if (cur->next) { cur = cur->next; continue; } /* Add new node to end of list. */ cur->next = move_ptr(new); goto out_rwlock_unlock; } out_rwlock_unlock: pthread_mutex_lock(&rv->lock); pthread_rwlock_unlock(&head->lock); return move_ptr(rv); } static struct cg_proc_stat *new_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg) { call_cleaner(free_proc_stat_node) struct cg_proc_stat *node = NULL; __do_free struct cpuacct_usage *new_usage = NULL; node = zalloc(sizeof(struct cg_proc_stat)); if (!node) return NULL; node->cg = strdup(cg); if (!node->cg) return NULL; new_usage = memdup(usage, sizeof(struct cpuacct_usage) * cpu_count); if (!new_usage) return NULL; node->view = zalloc(sizeof(struct cpuacct_usage) * cpu_count); if (!node->view) return NULL; node->cpu_count = cpu_count; if (pthread_mutex_init(&node->lock, NULL)) return NULL; /* * We're abusing the usage pointer to indicate that * pthread_mutex_init() was successful. Don't judge me. 
*/ node->usage = move_ptr(new_usage); return move_ptr(node); } static bool cgroup_supports(const char *controller, const char *cgroup, const char *file) { __do_free char *path = NULL; int cfd; cfd = get_cgroup_fd(controller); if (cfd < 0) return false; path = must_make_path_relative(cgroup, file, NULL); return faccessat(cfd, path, F_OK, 0) == 0; } /* should be called with wr-locked list */ static struct cg_proc_stat *prune_proc_stat_list(struct cg_proc_stat *node) { struct cg_proc_stat *first = NULL; for (struct cg_proc_stat *prev = NULL; node; ) { if (!cgroup_supports("cpu", node->cg, "cpu.shares")) { struct cg_proc_stat *cur = node; /* * We need to ensure that no one referenced this node, * because we are going to remove it from the list and free memory. * * If we can't grab the lock then just keep this node for now. */ if (pthread_mutex_trylock(&cur->lock)) goto next; /* * Yes, we can put lock back just after taking it, as we ensured * that we are only one user of it right now. * * It follows from three facts: * - we are under pthread_rwlock_wrlock(hash_table_bucket) * - pthread_mutex_lock is taken by find_proc_stat_node() * with pthread_rwlock_rdlock(hash_table_bucket) held. * - pthread_mutex_lock is taken by add_proc_stat_node() * with pthread_rwlock_wrlock(hash_table_bucket) held. * * It means that nobody can get a pointer to (cur) node in a parallel * thread and all old users of (cur) node have released pthread_mutex_lock(cur). 
*/ pthread_mutex_unlock(&cur->lock); if (prev) prev->next = node->next; else first = node->next; node = node->next; lxcfs_debug("Removing stat node for %s\n", cur->cg); free_proc_stat_node(cur); } else { next: if (!first) first = node; prev = node; node = node->next; } } return first; } #define PROC_STAT_PRUNE_INTERVAL 10 static void prune_proc_stat_history(void) { time_t now = time(NULL); for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) { if (rwlock_wrlock_interruptible(&proc_stat_history[i]->lock)) continue; if ((proc_stat_history[i]->lastcheck + PROC_STAT_PRUNE_INTERVAL) > now) { pthread_rwlock_unlock(&proc_stat_history[i]->lock); return; } if (proc_stat_history[i]->next) { proc_stat_history[i]->next = prune_proc_stat_list(proc_stat_history[i]->next); proc_stat_history[i]->lastcheck = now; } pthread_rwlock_unlock(&proc_stat_history[i]->lock); } } static struct cg_proc_stat *find_proc_stat_node(struct cg_proc_stat_head *head, const char *cg) { struct cg_proc_stat *node; prune_proc_stat_history(); if (rwlock_rdlock_interruptible(&head->lock)) return NULL; if (!head->next) { pthread_rwlock_unlock(&head->lock); return NULL; } node = head->next; do { if (strcmp(cg, node->cg) == 0) { /* * If we are failed to take a lock OR * fuse request was interrupted then * just return NULL and exit gracefully. 
*/ if (mutex_lock_interruptible(&node->lock)) node = NULL; goto out; } } while ((node = node->next)); node = NULL; out: pthread_rwlock_unlock(&head->lock); return node; } static struct cg_proc_stat *find_or_create_proc_stat_node(struct cpuacct_usage *usage, int cpu_count, const char *cg) { int hash = calc_hash(cg) % CPUVIEW_HASH_SIZE; struct cg_proc_stat_head *head = proc_stat_history[hash]; struct cg_proc_stat *node; node = find_proc_stat_node(head, cg); if (!node) { /* safe place to exit */ if (fuse_interrupted()) return NULL; node = new_proc_stat_node(usage, cpu_count, cg); if (!node) return NULL; node = add_proc_stat_node(node); lxcfs_debug("New stat node (%d) for %s\n", cpu_count, cg); } /* * If additional CPUs on the host have been enabled, CPU usage counter * arrays have to be expanded. */ if (node->cpu_count < cpu_count) { lxcfs_debug("Expanding stat node %d->%d for %s\n", node->cpu_count, cpu_count, cg); if (!expand_proc_stat_node(node, cpu_count)) { pthread_mutex_unlock(&node->lock); return log_debug(NULL, "Unable to expand stat node %d->%d for %s", node->cpu_count, cpu_count, cg); } } return node; } static void add_cpu_usage(uint64_t *surplus, struct cpuacct_usage *usage, uint64_t *counter, uint64_t threshold) { uint64_t free_space, to_add; free_space = threshold - usage->user - usage->system; if (free_space > usage->idle) free_space = usage->idle; if (free_space > *surplus) to_add = *surplus; else to_add = free_space; *counter += to_add; usage->idle -= to_add; *surplus -= to_add; } static uint64_t diff_cpu_usage(struct cpuacct_usage *older, struct cpuacct_usage *newer, struct cpuacct_usage *diff, int cpu_count) { uint64_t sum = 0; for (int i = 0; i < cpu_count; i++) { if (!newer[i].online) continue; /* * When cpuset is changed on the fly, the CPUs might get * reordered. We could either reset all counters, or check * that the substractions below will return expected results. 
*/ if (newer[i].user > older[i].user) diff[i].user = newer[i].user - older[i].user; else diff[i].user = 0; if (newer[i].system > older[i].system) diff[i].system = newer[i].system - older[i].system; else diff[i].system = 0; if (newer[i].idle > older[i].idle) diff[i].idle = newer[i].idle - older[i].idle; else diff[i].idle = 0; sum += diff[i].user; sum += diff[i].system; sum += diff[i].idle; } return sum; } /* * Read cgroup CPU quota parameters from `cpu.cfs_quota_us` or * `cpu.cfs_period_us`, depending on `param`. Parameter value is returned * through `value`. */ static bool read_cpu_cfs_param(const char *cg, const char *param, int64_t *value) { __do_free char *str = NULL; char file[STRLITERALLEN("cpu.cfs_period_us") + 1]; bool first = true; int ret; if (pure_unified_layout(cgroup_ops)) { first = !strcmp(param, "quota"); ret = snprintf(file, sizeof(file), "cpu.max"); } else { ret = snprintf(file, sizeof(file), "cpu.cfs_%s_us", param); } if (ret < 0 || (size_t)ret >= sizeof(file)) return false; if (!cgroup_ops->get(cgroup_ops, "cpu", cg, file, &str)) return false; return sscanf(str, first ? "%" PRId64 : "%*d %" PRId64, value) == 1; } /* * Return the exact number of visible CPUs based on CPU quotas. * If there is no quota set, zero is returned. */ static double exact_cpu_count(const char *cg) { double rv; int nprocs; int64_t cfs_quota, cfs_period; if (!read_cpu_cfs_param(cg, "quota", &cfs_quota)) return 0; if (!read_cpu_cfs_param(cg, "period", &cfs_period)) return 0; if (cfs_quota <= 0 || cfs_period <= 0) return 0; rv = (double)cfs_quota / (double)cfs_period; nprocs = get_nprocs(); if (rv > nprocs) rv = nprocs; return rv; } /* * Return true if cfs quota of the cgroup is neg / not set */ static bool cfs_quota_disabled(const char *cg) { int64_t cfs_quota; if (!read_cpu_cfs_param(cg, "quota", &cfs_quota)) return true; return cfs_quota < 0; } /* * Return the maximum number of visible CPUs based on CPU quotas. 
* If there is no quota set, cpu number in cpuset value is returned. */ int max_cpu_count(const char *cpuset_cg, const char *cpu_cg) { __do_free char *cpuset = NULL; int rv, nprocs; int64_t cfs_quota, cfs_period; int nr_cpus_in_cpuset = 0; if (!read_cpu_cfs_param(cpu_cg, "quota", &cfs_quota)) cfs_quota = 0; if (!read_cpu_cfs_param(cpu_cg, "period", &cfs_period)) cfs_period = 0; cpuset = get_cpuset(cpuset_cg); if (cpuset) nr_cpus_in_cpuset = cpu_number_in_cpuset(cpuset); if (cfs_quota <= 0 || cfs_period <= 0) { if (nr_cpus_in_cpuset > 0) return nr_cpus_in_cpuset; return 0; } rv = cfs_quota / cfs_period; /* * In case quota/period does not yield a whole number, add one CPU for * the remainder. */ if ((cfs_quota % cfs_period) > 0) rv += 1; nprocs = get_nprocs(); if (rv > nprocs) rv = nprocs; /* Use min value in cpu quota and cpuset. */ if (nr_cpus_in_cpuset > 0 && nr_cpus_in_cpuset < rv) rv = nr_cpus_in_cpuset; return rv; } int cpuview_proc_stat(const char *cg, const char *cpu_cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size, FILE *f, char *buf, size_t buf_size) { __do_free char *line = NULL; __do_free struct cpuacct_usage *diff = NULL; size_t linelen = 0, total_len = 0; int curcpu = -1; /* cpu numbering starts at 0 */ int physcpu, i; int cpu_cnt = 0; uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0; uint64_t user_sum = 0, system_sum = 0, idle_sum = 0; uint64_t user_surplus = 0, system_surplus = 0; int nprocs, max_cpus; ssize_t l; uint64_t total_sum, threshold; struct cg_proc_stat *stat_node; nprocs = get_nprocs_conf(); if (cg_cpu_usage_size < nprocs) nprocs = cg_cpu_usage_size; /* Read all CPU stats and stop when we've encountered other lines */ while (getline(&line, &linelen, f) != -1) { int ret; char cpu_char[10]; /* That's a lot of cores */ uint64_t all_used, cg_used; if (strlen(line) == 0) continue; /* not a ^cpuN line containing a number N */ if (sscanf(line, 
"cpu%9[^ ]", cpu_char) != 1) break; if (sscanf(cpu_char, "%d", &physcpu) != 1) continue; if (physcpu >= cg_cpu_usage_size) continue; curcpu++; cpu_cnt++; if (!cpu_in_cpuset(physcpu, cpuset)) { for (i = curcpu; i <= physcpu; i++) cg_cpu_usage[i].online = false; continue; } if (curcpu < physcpu) { /* Some CPUs may be disabled */ for (i = curcpu; i < physcpu; i++) cg_cpu_usage[i].online = false; curcpu = physcpu; } cg_cpu_usage[curcpu].online = true; ret = sscanf(line, "%*s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 "lu", &user, &nice, &system, &idle, &iowait, &irq, &softirq, &steal, &guest, &guest_nice); if (ret != 10) continue; all_used = user + nice + system + iowait + irq + softirq + steal + guest + guest_nice; cg_used = cg_cpu_usage[curcpu].user + cg_cpu_usage[curcpu].system; if (all_used >= cg_used) { cg_cpu_usage[curcpu].idle = idle + (all_used - cg_used); } else { lxcfs_v("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time", curcpu, cg, all_used, cg_used); cg_cpu_usage[curcpu].idle = idle; } } /* Cannot use more CPUs than is available in cpuset. */ max_cpus = max_cpu_count(cg, cpu_cg); if (max_cpus > cpu_cnt || !max_cpus) max_cpus = cpu_cnt; /* takes lock pthread_mutex_lock(&node->lock) */ stat_node = find_or_create_proc_stat_node(cg_cpu_usage, nprocs, cg); if (!stat_node) return log_error(0, "Failed to find/create stat node for %s", cg); diff = zalloc(sizeof(struct cpuacct_usage) * nprocs); if (!diff) goto out_pthread_mutex_unlock; /* * If the new values are LOWER than values stored in memory, it means * the cgroup has been reset/recreated and we should reset too. 
*/ for (curcpu = 0; curcpu < nprocs; curcpu++) { if (!cg_cpu_usage[curcpu].online) continue; if (cg_cpu_usage[curcpu].user < stat_node->usage[curcpu].user) reset_proc_stat_node(stat_node, cg_cpu_usage, nprocs); break; } total_sum = diff_cpu_usage(stat_node->usage, cg_cpu_usage, diff, nprocs); for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) { stat_node->usage[curcpu].online = cg_cpu_usage[curcpu].online; if (!stat_node->usage[curcpu].online) continue; i++; stat_node->usage[curcpu].user += diff[curcpu].user; stat_node->usage[curcpu].system += diff[curcpu].system; stat_node->usage[curcpu].idle += diff[curcpu].idle; if (max_cpus > 0 && i >= max_cpus) { user_surplus += diff[curcpu].user; system_surplus += diff[curcpu].system; } } /* Calculate usage counters of visible CPUs */ if (max_cpus > 0) { uint64_t diff_user = 0; uint64_t diff_system = 0; uint64_t diff_idle = 0; uint64_t max_diff_idle = 0; uint64_t max_diff_idle_index = 0; double exact_cpus; /* threshold = maximum usage per cpu, including idle */ threshold = total_sum / cpu_cnt * max_cpus; for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) { if (!stat_node->usage[curcpu].online) continue; i++; if (i == max_cpus) break; if (diff[curcpu].user + diff[curcpu].system >= threshold) continue; /* Add user */ add_cpu_usage(&user_surplus, &diff[curcpu], &diff[curcpu].user, threshold); if (diff[curcpu].user + diff[curcpu].system >= threshold) continue; /* If there is still room, add system */ add_cpu_usage(&system_surplus, &diff[curcpu], &diff[curcpu].system, threshold); } if (user_surplus > 0) lxcfs_debug("leftover user: %" PRIu64 "for %s\n", user_surplus, cg); if (system_surplus > 0) lxcfs_debug("leftover system: %" PRIu64 "for %s\n", system_surplus, cg); for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) { if (!stat_node->usage[curcpu].online) continue; i++; if (i == max_cpus) break; stat_node->view[curcpu].user += diff[curcpu].user; stat_node->view[curcpu].system += diff[curcpu].system; stat_node->view[curcpu].idle 
+= diff[curcpu].idle; diff_user += diff[curcpu].user; diff_system += diff[curcpu].system; diff_idle += diff[curcpu].idle; if (diff[curcpu].idle > max_diff_idle) { max_diff_idle = diff[curcpu].idle; max_diff_idle_index = curcpu; } lxcfs_v("curcpu: %d, diff_user: %" PRIu64 ", diff_system: %" PRIu64 ", diff_idle: %" PRIu64 "\n", curcpu, diff[curcpu].user, diff[curcpu].system, diff[curcpu].idle); } lxcfs_v("total. diff_user: %" PRIu64 ", diff_system: %" PRIu64 ", diff_idle: %" PRIu64 "\n", diff_user, diff_system, diff_idle); for (curcpu = 0; curcpu < nprocs; curcpu++) { user_sum += stat_node->view[curcpu].user; system_sum += stat_node->view[curcpu].system; idle_sum += stat_node->view[curcpu].idle; } /* revise cpu usage view to support partial cpu case. */ exact_cpus = exact_cpu_count(cg); /* skip revise cpu when cfs quota is disabled (exact_cpus == 0) */ if (!cfs_quota_disabled(cg) && exact_cpus < (double)max_cpus){ uint64_t delta = (uint64_t)((double)(diff_user + diff_system + diff_idle) * (1 - exact_cpus / (double)max_cpus)); lxcfs_v("revising cpu usage view to match the exact cpu count [%f]\n", exact_cpus); lxcfs_v("delta: %" PRIu64 "\n", delta); lxcfs_v("idle_sum before: %" PRIu64 "\n", idle_sum); if (idle_sum > delta) idle_sum = idle_sum - delta; else idle_sum = 0; lxcfs_v("idle_sum after: %l" PRIu64 "\n", idle_sum); curcpu = max_diff_idle_index; lxcfs_v("curcpu: %d, idle before: %" PRIu64 "\n", curcpu, stat_node->view[curcpu].idle); if (stat_node->view[curcpu].idle > delta) stat_node->view[curcpu].idle = stat_node->view[curcpu].idle - delta; else stat_node->view[curcpu].idle = 0; lxcfs_v("curcpu: %d, idle after: %" PRIu64 "\n", curcpu, stat_node->view[curcpu].idle); } } else { for (curcpu = 0; curcpu < nprocs; curcpu++) { if (!stat_node->usage[curcpu].online) continue; stat_node->view[curcpu].user = stat_node->usage[curcpu].user; stat_node->view[curcpu].system = stat_node->usage[curcpu].system; stat_node->view[curcpu].idle = stat_node->usage[curcpu].idle; 
user_sum += stat_node->view[curcpu].user; system_sum += stat_node->view[curcpu].system; idle_sum += stat_node->view[curcpu].idle; } } /* Render the file */ /* cpu-all */ l = snprintf(buf, buf_size, "cpu %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n", user_sum, system_sum, idle_sum); lxcfs_v("cpu-all: %s\n", buf); if (l < 0) { lxcfs_error("Failed to write cache"); total_len = 0; goto out_pthread_mutex_unlock; } if ((size_t)l >= buf_size) { lxcfs_error("Write to cache was truncated"); total_len = 0; goto out_pthread_mutex_unlock; } buf += l; buf_size -= l; total_len += l; /* Render visible CPUs Assume there are K CPUs: 0, 1, 2, ..., K-1. Among them, there are M online CPUs with index: a1, a2, ... aN ... aM (M >= N) N = max_cpus, M = number of online CPUs There will be N rendered cpus, indexed from 0 to N-1, cpu times of the cpus are calculated from those formula: - user_time[0] = stat_node->view[0].user + stat_node->view[1].user + ... + stat_node->view[a1].user - user_time[1] = stat_node->view[a1+1].user + stat_node->view[a1+1].user + ... + stat_node->view[a2].user ... - user_time[N-2] = stat_node->view[a(N-2)+1].user + stat_node->view[a(N-2)+2].user + ... + stat_node->view[a(N-1)].user - user_time[N-1] = stat_node->view[a(N-1)+1].user + stat_node->view[a(N-1)+2].user + ... + stat_node->view[aN] + ... 
+ stat_node->view[K-1] (sum of all remaining CPUs) Similar formula applied for system and idle time */ uint64_t curcpu_view_user_sum = 0, curcpu_view_system_sum = 0, curcpu_view_idle_sum = 0; for (curcpu = 0, i = -1; curcpu < nprocs; curcpu++) { curcpu_view_user_sum += stat_node->view[curcpu].user; curcpu_view_system_sum += stat_node->view[curcpu].system; curcpu_view_idle_sum += stat_node->view[curcpu].idle; if (!stat_node->usage[curcpu].online && curcpu < nprocs - 1) { continue; } i++; if (max_cpus > 0 && i >= max_cpus) { // max(i) = count(rendered cpus) = max_cpus - 1 i--; } if (max_cpus > 0 && i == max_cpus - 1 && curcpu < nprocs - 1) { // last 'rendered' cpu, sum until reaches the last cpu continue; } l = snprintf(buf, buf_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n", i, curcpu_view_user_sum, curcpu_view_system_sum, curcpu_view_idle_sum); lxcfs_v("cpu: %s\n", buf); if (l < 0) { lxcfs_error("Failed to write cache"); total_len = 0; goto out_pthread_mutex_unlock; } if ((size_t)l >= buf_size) { lxcfs_error("Write to cache was truncated"); total_len = 0; goto out_pthread_mutex_unlock; } buf += l; buf_size -= l; total_len += l; curcpu_view_user_sum = 0; curcpu_view_system_sum = 0; curcpu_view_idle_sum = 0; } /* Pass the rest of /proc/stat, start with the last line read */ l = snprintf(buf, buf_size, "%s", line); if (l < 0) { lxcfs_error("Failed to write cache"); total_len = 0; goto out_pthread_mutex_unlock; } if ((size_t)l >= buf_size) { lxcfs_error("Write to cache was truncated"); total_len = 0; goto out_pthread_mutex_unlock; } buf += l; buf_size -= l; total_len += l; /* Pass the rest of the host's /proc/stat */ while (getline(&line, &linelen, f) != -1) { l = snprintf(buf, buf_size, "%s", line); if (l < 0) { lxcfs_error("Failed to write cache"); total_len = 0; goto out_pthread_mutex_unlock; } if ((size_t)l >= buf_size) { lxcfs_error("Write to cache was truncated"); total_len = 0; goto out_pthread_mutex_unlock; } buf += l; buf_size -= l; 
/*
 * Tell whether @line is a "processor : <N>" line as found in /proc/cpuinfo.
 *
 * The line must start with "processor", followed by optional whitespace, a
 * colon, and a decimal CPU number. The number itself is discarded.
 */
static inline bool is_processor_line(const char *line)
{
	int ignored = 0;
	int matched = sscanf(line, "processor : %d", &ignored);

	return matched == 1;
}
size: left; memcpy(buf, cache + offset, total_len); return total_len; } pid_t initpid = lookup_initpid_in_store(fc->pid); if (initpid <= 1 || is_shared_pidns(initpid)) initpid = fc->pid; cg = get_pid_cgroup(initpid, "cpuset"); if (!cg) return read_file_fuse("proc/cpuinfo", buf, size, d); prune_init_slice(cg); cpu_cg = get_pid_cgroup(initpid, "cpu"); if (!cpu_cg) return read_file_fuse("proc/cpuinfo", buf, size, d); prune_init_slice(cpu_cg); cpuset = get_cpuset(cg); if (!cpuset) return 0; if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs) use_view = true; else use_view = false; if (use_view) max_cpus = max_cpu_count(cg, cpu_cg); f = fopen_cached("/proc/cpuinfo", "re", &fopen_cache); if (!f) return 0; while (getline(&line, &linelen, f) != -1) { ssize_t l; if (firstline) { firstline = false; if (strstr(line, "IBM/S390") != NULL) { is_s390x = true; am_printing = true; continue; } } if (strncmp(line, "# processors:", 12) == 0) continue; if (is_processor_line(line)) { if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus) break; am_printing = cpuline_in_cpuset(line, cpuset); if (am_printing) { curcpu++; l = snprintf(cache, cache_size, "processor : %d\n", curcpu); if (l < 0) return log_error(0, "Failed to write cache"); if ((size_t)l >= cache_size) return log_error(0, "Write to cache was truncated"); cache += l; cache_size -= l; total_len += l; } continue; } else if (is_s390x && sscanf(line, "processor %d:", &cpu) == 1) { char *p; if (use_view && max_cpus > 0 && (curcpu + 1) == max_cpus) break; if (!cpu_in_cpuset(cpu, cpuset)) continue; curcpu ++; p = strchr(line, ':'); if (!p || !*p) return 0; p++; l = snprintf(cache, cache_size, "processor %d:%s", curcpu, p); if (l < 0) return log_error(0, "Failed to write cache"); if ((size_t)l >= cache_size) return log_error(0, "Write to cache was truncated"); cache += l; cache_size -= l; total_len += l; continue; } if (am_printing) { l = snprintf(cache, cache_size, "%s", line); if (l < 0) return log_error(0, 
"Failed to write cache"); if ((size_t)l >= cache_size) return log_error(0, "Write to cache was truncated"); cache += l; cache_size -= l; total_len += l; } } if (is_s390x) { __do_free char *origcache = d->buf; ssize_t l; d->buf = malloc(d->buflen); if (!d->buf) { d->buf = move_ptr(origcache); return 0; } cache = d->buf; cache_size = d->buflen; total_len = 0; l = snprintf(cache, cache_size, "vendor_id : IBM/S390\n"); if (l < 0 || (size_t)l >= cache_size) return 0; cache_size -= l; cache += l; total_len += l; l = snprintf(cache, cache_size, "# processors : %d\n", curcpu + 1); if (l < 0 || (size_t)l >= cache_size) return 0; cache_size -= l; cache += l; total_len += l; l = snprintf(cache, cache_size, "%s", origcache); if (l < 0 || (size_t)l >= cache_size) return 0; total_len += l; } d->cached = 1; d->size = total_len; if (total_len > size) total_len = size; /* read from off 0 */ memcpy(buf, d->buf, total_len); return total_len; } /* * Returns 0 on success. * It is the caller's responsibility to free `return_usage`, unless this * function returns an error. */ int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size) { __do_free char *usage_str = NULL; __do_free struct cpuacct_usage *cpu_usage = NULL; int i = 0, j = 0, read_pos = 0, read_cnt = 0; int cpucount; int ret; int cg_cpu; uint64_t cg_user, cg_system; int64_t ticks_per_sec; ticks_per_sec = sysconf(_SC_CLK_TCK); if (ticks_per_sec < 0 && errno == EINVAL) { lxcfs_debug("%m - Failed to determine number of ticks per second"); return -1; } cpucount = get_nprocs_conf(); cpu_usage = malloc(sizeof(struct cpuacct_usage) * cpucount); if (!cpu_usage) return -ENOMEM; memset(cpu_usage, 0, sizeof(struct cpuacct_usage) * cpucount); if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_all", &usage_str)) { char *sep = " \t\n"; char *tok; /* Read cpuacct.usage_percpu instead. 
*/ lxcfs_debug("Falling back to cpuacct.usage_percpu"); if (!cgroup_ops->get(cgroup_ops, "cpuacct", cg, "cpuacct.usage_percpu", &usage_str)) return -1; lxc_iterate_parts(tok, usage_str, sep) { uint64_t percpu_user; if (i >= cpucount) break; tok = trim_whitespace_in_place(tok); ret = safe_uint64(tok, &percpu_user, 10); if (ret) return -1; /* Convert the time from nanoseconds to USER_HZ */ cpu_usage[i].user = percpu_user / 1000.0 / 1000 / 1000 * ticks_per_sec; cpu_usage[i].system = cpu_usage[i].user; i++; lxcfs_debug("cpu%d with time %s", i, tok); } } else { if (sscanf(usage_str, "cpu user system\n%n", &read_cnt) != 0) return log_error(-1, "read_cpuacct_usage_all reading first line from %s/cpuacct.usage_all failed", cg); read_pos += read_cnt; for (i = 0, j = 0; i < cpucount; i++) { ret = sscanf(usage_str + read_pos, "%d %" PRIu64 " %" PRIu64 "\n%n", &cg_cpu, &cg_user, &cg_system, &read_cnt); if (ret == EOF) break; if (ret != 3) return log_error(-EINVAL, "Failed to parse cpuacct.usage_all line %s from cgroup %s", usage_str + read_pos, cg); read_pos += read_cnt; /* Convert the time from nanoseconds to USER_HZ */ cpu_usage[j].user = cg_user / 1000.0 / 1000 / 1000 * ticks_per_sec; cpu_usage[j].system = cg_system / 1000.0 / 1000 / 1000 * ticks_per_sec; j++; } } *return_usage = move_ptr(cpu_usage); *size = cpucount; return 0; } static bool cpuview_init_head(struct cg_proc_stat_head **head) { __do_free struct cg_proc_stat_head *h; h = zalloc(sizeof(struct cg_proc_stat_head)); if (!h) return false; if (pthread_rwlock_init(&h->lock, NULL)) return false; h->lastcheck = time(NULL); *head = move_ptr(h); return true; } bool init_cpuview(void) { int i; for (i = 0; i < CPUVIEW_HASH_SIZE; i++) proc_stat_history[i] = NULL; for (i = 0; i < CPUVIEW_HASH_SIZE; i++) { if (!cpuview_init_head(&proc_stat_history[i])) goto err; } return true; err: for (i = 0; i < CPUVIEW_HASH_SIZE; i++) { if (proc_stat_history[i]) free_disarm(proc_stat_history[i]); } return false; } static void 
cpuview_free_head(struct cg_proc_stat_head *head) { struct cg_proc_stat *node; if (head->next) { node = head->next; for (;;) { struct cg_proc_stat *cur = node; node = node->next; free_proc_stat_node(cur); if (!node) break; } } pthread_rwlock_destroy(&head->lock); free_disarm(head); } void free_cpuview(void) { for (int i = 0; i < CPUVIEW_HASH_SIZE; i++) if (proc_stat_history[i]) cpuview_free_head(proc_stat_history[i]); } ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/proc_cpuview.h0000664000175000017500000000167114773561567016605 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #ifndef __LXCFS_PROC_CPUVIEW_FUSE_H #define __LXCFS_PROC_CPUVIEW_FUSE_H #include "config.h" #include #include #include #include #include "lxcfs_fuse.h" #include "macro.h" struct cpuacct_usage { uint64_t user; uint64_t system; uint64_t idle; bool online; }; extern int cpuview_proc_stat(const char *cg, const char *cpu_cg, const char *cpuset, struct cpuacct_usage *cg_cpu_usage, int cg_cpu_usage_size, FILE *f, char *buf, size_t buf_size); extern int proc_cpuinfo_read(char *buf, size_t size, off_t offset, struct fuse_file_info *fi); extern int read_cpuacct_usage_all(char *cg, char *cpuset, struct cpuacct_usage **return_usage, int *size); extern bool init_cpuview(void); extern void free_cpuview(void); extern int max_cpu_count(const char *cpuset_cg, const char *cpu_cg); #endif /* __LXCFS_PROC_CPUVIEW_FUSE_H */ ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/proc_fuse.c0000664000175000017500000014420014773561567016054 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #include "config.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "proc_fuse.h" 
#include "bindings.h" #include "cgroup_fuse.h" #include "cgroups/cgroup.h" #include "cgroups/cgroup_utils.h" #include "cpuset_parse.h" #include "lxcfs_fuse_compat.h" #include "memory_utils.h" #include "proc_loadavg.h" #include "proc_cpuview.h" #include "utils.h" struct memory_stat { uint64_t hierarchical_memory_limit; uint64_t hierarchical_memsw_limit; uint64_t total_cache; uint64_t total_rss; uint64_t total_rss_huge; uint64_t total_shmem; uint64_t total_mapped_file; uint64_t total_dirty; uint64_t total_writeback; uint64_t total_swap; uint64_t total_pgpgin; uint64_t total_pgpgout; uint64_t total_pgfault; uint64_t total_pgmajfault; uint64_t total_inactive_anon; uint64_t total_active_anon; uint64_t total_inactive_file; uint64_t total_active_file; uint64_t total_unevictable; uint64_t slab; uint64_t slab_reclaimable; uint64_t slab_unreclaimable; }; static off_t get_procfile_size(const char *path) { __do_fclose FILE *f = NULL; __do_free char *line = NULL; size_t len = 0; ssize_t sz, answer = 0; f = fopen(path, "re"); if (!f) return 0; while ((sz = getline(&line, &len, f)) != -1) answer += sz; return answer; } static off_t get_procfile_size_with_personality(const char *path) { struct fuse_context *fc = fuse_get_context(); __u32 host_personality = liblxcfs_personality(), caller_personality; bool change_personality; int ret; off_t procfile_size_ret; if (get_task_personality(fc->pid, &caller_personality) < 0) return log_error(0, "Failed to get caller process (pid: %d) personality", fc->pid); /* do we need to change thread personality? 
*/ change_personality = host_personality != caller_personality; if (change_personality) { ret = personality(caller_personality); if (ret == -1) return log_error(0, "Call to personality(%d) failed: %s\n", caller_personality, strerror(errno)); lxcfs_debug("task (tid: %d) personality was changed %d -> %d\n", (int)syscall(SYS_gettid), ret, caller_personality); } procfile_size_ret = get_procfile_size(path); if (change_personality) { ret = personality(host_personality); if (ret == -1) return log_error(0, "Call to personality(%d) failed: %s\n", host_personality, strerror(errno)); lxcfs_debug("task (tid: %d) personality was restored %d -> %d\n", (int)syscall(SYS_gettid), ret, host_personality); } return procfile_size_ret; } __lxcfs_fuse_ops int proc_getattr(const char *path, struct stat *sb) { struct timespec now; memset(sb, 0, sizeof(struct stat)); if (clock_gettime(CLOCK_REALTIME, &now) < 0) return -EINVAL; sb->st_uid = sb->st_gid = 0; sb->st_atim = sb->st_mtim = sb->st_ctim = now; if (strcmp(path, "/proc") == 0) { sb->st_mode = S_IFDIR | 00555; sb->st_nlink = 2; return 0; } if (strcmp(path, "/proc/meminfo") == 0 || strcmp(path, "/proc/cpuinfo") == 0 || strcmp(path, "/proc/uptime") == 0 || strcmp(path, "/proc/stat") == 0 || strcmp(path, "/proc/diskstats") == 0 || strcmp(path, "/proc/swaps") == 0 || strcmp(path, "/proc/loadavg") == 0 || strcmp(path, "/proc/slabinfo") == 0) { if (liblxcfs_functional()) { if (!can_access_personality()) return log_error(-EACCES, RESTRICTED_PERSONALITY_ACCESS_POLICY); sb->st_size = get_procfile_size_with_personality(path); } else sb->st_size = get_procfile_size(path); sb->st_mode = S_IFREG | 00444; sb->st_nlink = 1; return 0; } return -ENOENT; } __lxcfs_fuse_ops int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi) { if (dir_filler(filler, buf, ".", 0) != 0 || dir_filler(filler, buf, "..", 0) != 0 || dir_filler(filler, buf, "cpuinfo", 0) != 0 || dir_filler(filler, buf, "meminfo", 0) != 
0 || dir_filler(filler, buf, "stat", 0) != 0 || dir_filler(filler, buf, "uptime", 0) != 0 || dir_filler(filler, buf, "diskstats", 0) != 0 || dir_filler(filler, buf, "swaps", 0) != 0 || dir_filler(filler, buf, "loadavg", 0) != 0 || dir_filler(filler, buf, "slabinfo", 0) != 0) return -EINVAL; return 0; } __lxcfs_fuse_ops int proc_open(const char *path, struct fuse_file_info *fi) { __do_free struct file_info *info = NULL; int type = -1; if (strcmp(path, "/proc/meminfo") == 0) type = LXC_TYPE_PROC_MEMINFO; else if (strcmp(path, "/proc/cpuinfo") == 0) type = LXC_TYPE_PROC_CPUINFO; else if (strcmp(path, "/proc/uptime") == 0) type = LXC_TYPE_PROC_UPTIME; else if (strcmp(path, "/proc/stat") == 0) type = LXC_TYPE_PROC_STAT; else if (strcmp(path, "/proc/diskstats") == 0) type = LXC_TYPE_PROC_DISKSTATS; else if (strcmp(path, "/proc/swaps") == 0) type = LXC_TYPE_PROC_SWAPS; else if (strcmp(path, "/proc/loadavg") == 0) type = LXC_TYPE_PROC_LOADAVG; else if (strcmp(path, "/proc/slabinfo") == 0) type = LXC_TYPE_PROC_SLABINFO; if (type == -1) return -ENOENT; info = zalloc(sizeof(*info)); if (!info) return -ENOMEM; info->type = type; if (liblxcfs_functional()) { if (!can_access_personality()) return log_error(-EACCES, RESTRICTED_PERSONALITY_ACCESS_POLICY); info->buflen = get_procfile_size_with_personality(path) + BUF_RESERVE_SIZE; } else info->buflen = get_procfile_size(path) + BUF_RESERVE_SIZE; info->buf = zalloc(info->buflen); if (!info->buf) return -ENOMEM; /* set actual size to buffer size */ info->size = info->buflen; fi->fh = PTR_TO_UINT64(move_ptr(info)); return 0; } __lxcfs_fuse_ops int proc_access(const char *path, int mask) { if (strcmp(path, "/proc") == 0 && access(path, R_OK) == 0) return 0; /* these are all read-only */ if ((mask & ~R_OK) != 0) return -EACCES; return 0; } __lxcfs_fuse_ops int proc_release(const char *path, struct fuse_file_info *fi) { do_release_file_info(fi); return 0; } /** * Gets a non-hierarchical memory controller limit, or UINT64_MAX if no limit 
is * in place. If `swap` is true, reads 'swap' (v2) or 'memsw' (v1); otherwise * reads the memory (RAM) limits. * * @returns 0 on success (and sets `*limit`), < 0 on error */ static int get_memlimit(const char *cgroup, bool swap, uint64_t *limit) { __do_free char *memlimit_str = NULL; uint64_t memlimit = UINT64_MAX; int ret; if (swap) ret = cgroup_ops->get_memory_swap_max(cgroup_ops, cgroup, &memlimit_str); else ret = cgroup_ops->get_memory_max(cgroup_ops, cgroup, &memlimit_str); if (ret < 0) return ret; if (memlimit_str[0]) { ret = safe_uint64(memlimit_str, &memlimit, 10); if (ret < 0) { lxcfs_error("Failed to convert memory%s.max=%s for cgroup %s", swap ? ".swap" : "", memlimit_str, cgroup); return ret; } } *limit = memlimit; return 0; } /* * This function taken from glibc-2.32, as POSIX dirname("/some-dir") will * return "/some-dir" as opposed to "/", which breaks `get_min_memlimit()` */ static char *gnu_dirname(char *path) { static const char dot[] = "."; char *last_slash; /* Find last '/'. */ last_slash = path != NULL ? strrchr(path, '/') : NULL; if (last_slash != NULL && last_slash != path && last_slash[1] == '\0') { /* Determine whether all remaining characters are slashes. */ char *runp; for (runp = last_slash; runp != path; --runp) if (runp[-1] != '/') break; /* The '/' is the last character, we have to look further. */ if (runp != path) last_slash = memrchr(path, '/', runp - path); } if (last_slash != NULL) { /* Determine whether all remaining characters are slashes. */ char *runp; for (runp = last_slash; runp != path; --runp) if (runp[-1] != '/') break; /* Terminate the path. */ if (runp == path) { /* * The last slash is the first character in the string. * We have to return "/". As a special case we have to * return "//" if there are exactly two slashes at the * beginning of the string. 
See XBD 4.10 Path Name * Resolution for more information */ if (last_slash == path + 1) ++last_slash; else last_slash = path + 1; } else last_slash = runp; last_slash[0] = '\0'; } else { /* * This assignment is ill-designed but the XPG specs require to * return a string containing "." in any case no directory part * is found and so a static and constant string is required. */ path = (char *)dot; } return path; } /** * Gets a hierarchical memory controller limit, or UINT64_MAX if no limit is * in place. If `swap` is true, reads 'swap' (v2) or 'memsw' (v1); otherwise * reads the memory (RAM) limits. * * @returns 0 on success (and sets `*limit`), < 0 on error */ static int get_min_memlimit(const char *cgroup, bool swap, uint64_t *limit) { __do_free char *copy = NULL; uint64_t memlimit = UINT64_MAX, retlimit = UINT64_MAX; int ret; copy = strdup(cgroup); if (!copy) return log_error_errno(0, ENOMEM, "Failed to allocate memory"); ret = get_memlimit(copy, swap, &retlimit); if (ret < 0) return ret; /* * If the cgroup doesn't start with / (probably won't happen), dirname() * will terminate with "" instead of "/" */ while (retlimit != 0 && *copy && strcmp(copy, "/") != 0) { char *it = copy; it = gnu_dirname(it); ret = get_memlimit(it, swap, &memlimit); if (ret < 0) return ret; if (memlimit < retlimit) retlimit = memlimit; } *limit = retlimit; return 0; } static inline bool startswith(const char *line, const char *pref) { return strncmp(line, pref, strlen(pref)) == 0; } static void get_swap_info(const char *cgroup, uint64_t memlimit, uint64_t memusage, uint64_t *swtotal, uint64_t *swusage, uint64_t *memswpriority) { __do_free char *memswusage_str = NULL, *memswpriority_str = NULL; uint64_t memswlimit = 0, memswusage = 0; int ret; *swtotal = *swusage = 0; *memswpriority = 1; ret = get_min_memlimit(cgroup, true, &memswlimit); if (ret < 0) return; ret = cgroup_ops->get_memory_swap_current(cgroup_ops, cgroup, &memswusage_str); if (ret < 0 || safe_uint64(memswusage_str, 
&memswusage, 10) < 0) return; if (liblxcfs_memory_is_cgroupv2()) { *swtotal = memswlimit / 1024; *swusage = memswusage / 1024; } else { if (memlimit > memswlimit) *swtotal = 0; else *swtotal = (memswlimit - memlimit) / 1024; if (memusage > memswusage || *swtotal == 0) *swusage = 0; else *swusage = (memswusage - memusage) / 1024; } ret = cgroup_ops->get_memory_swappiness(cgroup_ops, cgroup, &memswpriority_str); if (ret >= 0) safe_uint64(memswpriority_str, memswpriority, 10); } static int proc_swaps_read(char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { __do_free char *cgroup = NULL, *memusage_str = NULL, *memswusage_str = NULL, *memswpriority_str = NULL; struct fuse_context *fc = fuse_get_context(); bool wants_swap = lxcfs_has_opt(fuse_get_context()->private_data, LXCFS_SWAP_ON); struct file_info *d = INTTYPE_TO_PTR(fi->fh); uint64_t memlimit = 0, memusage = 0, swtotal = 0, swusage = 0, memswpriority = 1, hostswtotal = 0, hostswfree = 0; ssize_t total_len = 0; ssize_t l = 0; char *cache = d->buf; int ret; __do_free char *line = NULL; __do_free void *fopen_cache = NULL; __do_fclose FILE *f = NULL; size_t linelen = 0; if (offset) { size_t left; if (offset > d->size) return -EINVAL; if (!d->cached) return 0; left = d->size - offset; total_len = left > size ? 
size: left; memcpy(buf, cache + offset, total_len); return total_len; } pid_t initpid = lookup_initpid_in_store(fc->pid); if (initpid <= 1 || is_shared_pidns(initpid)) initpid = fc->pid; cgroup = get_pid_cgroup(initpid, "memory"); if (!cgroup) return read_file_fuse("/proc/swaps", buf, size, d); prune_init_slice(cgroup); ret = get_min_memlimit(cgroup, false, &memlimit); if (ret < 0) return 0; ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str); if (ret < 0) return 0; if (safe_uint64(memusage_str, &memusage, 10) < 0) lxcfs_error("Failed to convert memusage %s", memusage_str); if (wants_swap) get_swap_info(cgroup, memlimit, memusage, &swtotal, &swusage, &memswpriority); total_len = snprintf(d->buf, d->size, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); /* Read host total and free values */ f = fopen_cached("/proc/meminfo", "re", &fopen_cache); if (!f) return 0; while (getline(&line, &linelen, f) != -1) { if (startswith(line, "SwapTotal:")) sscanf(line, "SwapTotal: %8" PRIu64 " kB", &hostswtotal); else if (startswith(line, "SwapFree:")) sscanf(line, "SwapFree: %8" PRIu64 " kB", &hostswfree); } if (wants_swap) { /* For cgroups v1, the total amount of swap is always reported to be the lesser of the RAM+SWAP limit or the SWAP device size. This is because the kernel can swap as much as it wants and not only up to swtotal. */ if (!liblxcfs_memory_is_cgroupv2()) swtotal = memlimit / 1024 + swtotal; if (hostswtotal < swtotal) { swtotal = hostswtotal; } /* When swappiness is 0, pretend we can't swap. 
/*
 * Look up one counter in the text of a blkio statistics file.
 *
 * @str:    file content made of "MAJOR:MINOR TYPE VALUE" lines; may be NULL
 * @major:  device major number to look for
 * @minor:  device minor number to look for
 * @iotype: operation name to look for (e.g. "Read")
 * @v:      receives the value of the first matching line, 0 if none matches
 *
 * *@v is always initialized, also when @str is NULL or nothing matches.
 */
static void get_blkio_io_value(char *str, unsigned major, unsigned minor,
			       char *iotype, uint64_t *v)
{
	char key[32];
	size_t len;
	int ret;

	*v = 0;

	/* Nothing to search, e.g. the statistics file could not be read. */
	if (!str)
		return;

	ret = snprintf(key, sizeof(key), "%u:%u %s", major, minor, iotype);
	/* A truncated key could match the wrong line; treat it as no match. */
	if (ret < 0 || (size_t)ret >= sizeof(key))
		return;
	len = (size_t)ret;

	while (*str) {
		char *eol;

		if (strncmp(str, key, len) == 0) {
			sscanf(str + len, "%" PRIu64, v);
			return;
		}

		/* Advance to the beginning of the next line, if any. */
		eol = strchr(str, '\n');
		if (!eol)
			return;

		str = eol + 1;
	}
}

/* One output line of /proc/diskstats; the numbers are the column positions. */
struct lxcfs_diskstats {
	unsigned int major;		/* 1 - major number */
	unsigned int minor;		/* 2 - minor number */
	char dev_name[72];		/* 3 - device name */
	uint64_t read;			/* 4 - reads completed successfully */
	uint64_t read_merged;		/* 5 - reads merged */
	uint64_t read_sectors;		/* 6 - sectors read */
	uint64_t read_ticks;		/* 7 - time spent reading (ms) */
	uint64_t write;			/* 8 - writes completed */
	uint64_t write_merged;		/* 9 - writes merged */
	uint64_t write_sectors;		/* 10 - sectors written */
	uint64_t write_ticks;		/* 11 - time spent writing (ms) */
	uint64_t ios_pgr;		/* 12 - I/Os currently in progress */
	uint64_t total_ticks;		/* 13 - time spent doing I/Os (ms) */
	uint64_t rq_ticks;		/* 14 - weighted time spent doing I/Os (ms) */
	uint64_t discard;		/* 15 - discards completed successfully (4.18+) */
	uint64_t discard_merged;	/* 16 - discards merged (4.18+) */
	uint64_t discard_sectors;	/* 17 - sectors discarded (4.18+) */
	uint64_t discard_ticks;		/* 18 - time spent discarding (4.18+) */
};
/*
 * FUSE read handler producing the container view of /proc/diskstats.
 *
 * For every device line of the host's /proc/diskstats the counters are
 * replaced by values looked up (by major:minor) in the statistics strings
 * obtained from the cgroup_ops->get_io_*() calls for the caller's "blkio"
 * cgroup. Devices whose counters are all zero are omitted.
 *
 * A non-zero @offset is served from the copy cached by a previous offset-0
 * read.
 *
 * Returns the number of bytes copied into @buf, 0 on internal failure, or
 * -EINVAL when @offset lies beyond the cached data.
 */
static int proc_diskstats_read(char *buf, size_t size, off_t offset,
			       struct fuse_file_info *fi)
{
	__do_free char *cg = NULL, *io_serviced_str = NULL,
		       *io_merged_str = NULL, *io_service_bytes_str = NULL,
		       *io_wait_time_str = NULL, *io_service_time_str = NULL,
		       *line = NULL;
	__do_free void *fopen_cache = NULL;
	__do_fclose FILE *f = NULL;
	struct fuse_context *fc = fuse_get_context();
	struct file_info *d = INTTYPE_TO_PTR(fi->fh);
	/* ios_pgr and rq_ticks are never assigned below and thus stay 0. */
	struct lxcfs_diskstats stats = {};
	/* helper fields */
	uint64_t read_service_time, write_service_time, discard_service_time, read_wait_time,
	    write_wait_time, discard_wait_time;
	char *cache = d->buf;
	size_t cache_size = d->buflen;
	size_t linelen = 0, total_len = 0;
	int i = 0;
	int ret;

	if (offset) {
		size_t left;

		if (offset > d->size)
			return -EINVAL;

		if (!d->cached)
			return 0;

		left = d->size - offset;
		total_len = left > size ? size: left;
		memcpy(buf, cache + offset, total_len);

		return total_len;
	}

	pid_t initpid = lookup_initpid_in_store(fc->pid);
	if (initpid <= 1 || is_shared_pidns(initpid))
		initpid = fc->pid;

	cg = get_pid_cgroup(initpid, "blkio");
	if (!cg)
		return read_file_fuse("/proc/diskstats", buf, size, d);
	prune_init_slice(cg);

	/*
	 * -EOPNOTSUPP from any getter falls back to the host file.
	 * NOTE(review): other negative return values are not handled here;
	 * presumably the corresponding string then stays NULL and is passed
	 * to get_blkio_io_value() as is - confirm against the getters.
	 */
	ret = cgroup_ops->get_io_serviced(cgroup_ops, cg, &io_serviced_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	ret = cgroup_ops->get_io_merged(cgroup_ops, cg, &io_merged_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	ret = cgroup_ops->get_io_service_bytes(cgroup_ops, cg, &io_service_bytes_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	ret = cgroup_ops->get_io_wait_time(cgroup_ops, cg, &io_wait_time_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	ret = cgroup_ops->get_io_service_time(cgroup_ops, cg, &io_service_time_str);
	if (ret < 0) {
		if (ret == -EOPNOTSUPP)
			return read_file_fuse("/proc/diskstats", buf, size, d);
	}

	f = fopen_cached("/proc/diskstats", "re", &fopen_cache);
	if (!f)
		return 0;

	while (getline(&line, &linelen, f) != -1) {
		ssize_t l;
		char lbuf[256];

		/* Only major, minor and name are taken from the host line. */
		i = sscanf(line, "%u %u %71s", &stats.major, &stats.minor, stats.dev_name);
		if (i != 3)
			continue;

		get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Read", &stats.read);
		get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Write", &stats.write);
		get_blkio_io_value(io_serviced_str, stats.major, stats.minor, "Discard", &stats.discard);

		get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Read", &stats.read_merged);
		get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Write", &stats.write_merged);
		get_blkio_io_value(io_merged_str, stats.major, stats.minor, "Discard", &stats.discard_merged);

		/* Byte counts are turned into 512-byte sectors. */
		get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Read", &stats.read_sectors);
		stats.read_sectors = stats.read_sectors / 512;
		get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Write", &stats.write_sectors);
		stats.write_sectors = stats.write_sectors / 512;
		get_blkio_io_value(io_service_bytes_str, stats.major, stats.minor, "Discard", &stats.discard_sectors);
		stats.discard_sectors = stats.discard_sectors / 512;

		/*
		 * Each *_ticks value is service time plus wait time, both
		 * divided by 1000000 (presumably ns -> ms - TODO confirm the
		 * unit delivered by the getters).
		 */
		get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Read", &read_service_time);
		read_service_time = read_service_time / 1000000;
		get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Read", &read_wait_time);
		read_wait_time = read_wait_time / 1000000;
		stats.read_ticks = read_service_time + read_wait_time;

		get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Write", &write_service_time);
		write_service_time = write_service_time / 1000000;
		get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Write", &write_wait_time);
		write_wait_time = write_wait_time / 1000000;
		stats.write_ticks = write_service_time + write_wait_time;

		get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Discard", &discard_service_time);
		discard_service_time = discard_service_time / 1000000;
		get_blkio_io_value(io_wait_time_str, stats.major, stats.minor, "Discard", &discard_wait_time);
		discard_wait_time = discard_wait_time / 1000000;
		stats.discard_ticks = discard_service_time + discard_wait_time;

		get_blkio_io_value(io_service_time_str, stats.major, stats.minor, "Total", &stats.total_ticks);
		stats.total_ticks = stats.total_ticks / 1000000;

		memset(lbuf, 0, sizeof(lbuf));
		/* Devices without any activity in this cgroup are skipped. */
		if (stats.read || stats.write || stats.read_merged || stats.write_merged ||
		    stats.read_sectors || stats.write_sectors || stats.read_ticks ||
		    stats.write_ticks || stats.ios_pgr || stats.total_ticks || stats.rq_ticks ||
		    stats.discard || stats.discard_merged || stats.discard_sectors ||
		    stats.discard_ticks) {
			ret = strnprintf(
				lbuf, sizeof(lbuf),
				"%u %u" /* major, minor */
				" %s" /* dev_name */
				" %" PRIu64 /* read */
				" %" PRIu64 /* read_merged */
				" %" PRIu64 /* read_sectors */
				" %" PRIu64 /* read_ticks */
				" %" PRIu64 /* write */
				" %" PRIu64 /* write_merged */
				" %" PRIu64 /* write_sectors */
				" %" PRIu64 /* write_ticks */
				" %" PRIu64 /* ios_pgr */
				" %" PRIu64 /* total_ticks */
				" %" PRIu64 /* rq_ticks */
				" %" PRIu64 /* discard */
				" %" PRIu64 /* discard_merged */
				" %" PRIu64 /* discard_sectors */
				" %" PRIu64 /* discard_ticks */
				"\n",
				stats.major,
				stats.minor,
				stats.dev_name,
				stats.read,
				stats.read_merged,
				stats.read_sectors,
				stats.read_ticks,
				stats.write,
				stats.write_merged,
				stats.write_sectors,
				stats.write_ticks,
				stats.ios_pgr,
				stats.total_ticks,
				stats.rq_ticks,
				stats.discard,
				stats.discard_merged,
				stats.discard_sectors,
				stats.discard_ticks);
			if (ret < 0) {
				/* Drop only this device, keep going with the rest. */
				lxcfs_error("Insufficient buffer for %u:%u %s diskstats",
					    stats.major, stats.minor, stats.dev_name);
				continue;
			}
		} else {
			continue;
		}

		/* Append the formatted line to the per-file cache. */
		l = snprintf(cache, cache_size, "%s", lbuf);
		if (l < 0)
			return log_error(0, "Failed to write cache");
		if ((size_t)l >= cache_size)
			return log_error(0, "Write to cache was truncated");

		cache += l;
		cache_size -= l;
		total_len += l;
	}

	/* Remember the result so that reads at an offset use the cache. */
	d->cached = 1;
	d->size = total_len;
	if (total_len > size)
		total_len = size;
	memcpy(buf, d->buf, total_len);

	return total_len;
}
inline void iwashere(void) { mknod("/tmp/lxcfs-iwashere", S_IFREG, 0644); } #endif /* * This function retrieves the busy time of a group of tasks by looking at * cpuacct.usage. Unfortunately, this only makes sense when the container has * been given it's own cpuacct cgroup. If not, this function will take the busy * time of all other taks that do not actually belong to the container into * account as well. If someone has a clever solution for this please send a * patch! */ static double get_reaper_busy(pid_t task) { __do_free char *cgroup = NULL, *usage_str = NULL; uint64_t usage = 0; pid_t initpid; initpid = lookup_initpid_in_store(task); if (initpid <= 0) return 0; cgroup = get_pid_cgroup(initpid, "cpuacct"); if (!cgroup) return 0; prune_init_slice(cgroup); if (!cgroup_ops->get(cgroup_ops, "cpuacct", cgroup, "cpuacct.usage", &usage_str)) return 0; if (safe_uint64(usage_str, &usage, 10) < 0) lxcfs_error("Failed to convert usage %s", usage_str); return ((double)usage / 1000000000); } static uint64_t get_reaper_start_time(pid_t pid) { __do_free void *fopen_cache = NULL; __do_fclose FILE *f = NULL; int ret; uint64_t starttime; char path[STRLITERALLEN("/proc/") + LXCFS_NUMSTRLEN64 + STRLITERALLEN("/stat") + 1]; pid_t qpid; qpid = lookup_initpid_in_store(pid); if (qpid <= 0) return ret_errno(EINVAL); ret = snprintf(path, sizeof(path), "/proc/%d/stat", qpid); if (ret < 0 || (size_t)ret >= sizeof(path)) return ret_errno(EINVAL); f = fopen_cached(path, "re", &fopen_cache); if (!f) return ret_errno(EINVAL); /* Note that the *scanf() argument supression requires that length * modifiers such as "l" are omitted. Otherwise some compilers will yell * at us. It's like telling someone you're not married and then asking * if you can bring your wife to the party. 
*/ ret = fscanf(f, "%*d " /* (1) pid %d */ "%*s " /* (2) comm %s */ "%*c " /* (3) state %c */ "%*d " /* (4) ppid %d */ "%*d " /* (5) pgrp %d */ "%*d " /* (6) session %d */ "%*d " /* (7) tty_nr %d */ "%*d " /* (8) tpgid %d */ "%*u " /* (9) flags %u */ "%*u " /* (10) minflt %lu */ "%*u " /* (11) cminflt %lu */ "%*u " /* (12) majflt %lu */ "%*u " /* (13) cmajflt %lu */ "%*u " /* (14) utime %lu */ "%*u " /* (15) stime %lu */ "%*d " /* (16) cutime %ld */ "%*d " /* (17) cstime %ld */ "%*d " /* (18) priority %ld */ "%*d " /* (19) nice %ld */ "%*d " /* (20) num_threads %ld */ "%*d " /* (21) itrealvalue %ld */ "%" PRIu64, /* (22) starttime %llu */ &starttime); if (ret != 1) return ret_errno(EINVAL); return ret_set_errno(starttime, 0); } static double get_reaper_start_time_in_sec(pid_t pid) { uint64_t clockticks, ticks_per_sec; int64_t ret; double res = 0; clockticks = get_reaper_start_time(pid); if (clockticks <= 0) return log_debug(0, "Failed to retrieve start time of pid %d", pid); ret = sysconf(_SC_CLK_TCK); if (ret < 0) return log_debug(0, "Failed to determine number of clock ticks in a second"); ticks_per_sec = (uint64_t)ret; res = (double)clockticks / ticks_per_sec; return res; } static double get_reaper_age(pid_t pid) { uint64_t uptime_ms; double procstart, procage; /* * We need to substract the time the process has started since system * boot minus the time when the system has started to get the actual * reaper age. */ procstart = get_reaper_start_time_in_sec(pid); procage = procstart; if (procstart > 0) { int ret; struct timespec spec; ret = clock_gettime(CLOCK_BOOTTIME, &spec); if (ret < 0) return 0; uptime_ms = (spec.tv_sec * 1000) + (spec.tv_nsec * 1e-6); procage = (uptime_ms - (procstart * 1000)) / 1000; } return procage; } /* * We read /proc/uptime and reuse its second field. 
* For the first field, we use the mtime for the reaper for * the calling pid as returned by getreaperage */ static int proc_uptime_read(char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { struct fuse_context *fc = fuse_get_context(); struct file_info *d = INTTYPE_TO_PTR(fi->fh); char *cache = d->buf; ssize_t total_len = 0, ret = 0; double busytime, idletime, reaperage; #ifdef RELOADTEST iwashere(); #endif if (offset) { size_t left; if (offset > d->size) return -EINVAL; if (!d->cached) return 0; left = d->size - offset; total_len = left > size ? size : left; memcpy(buf, cache + offset, total_len); return total_len; } reaperage = get_reaper_age(fc->pid); /* * To understand why this is done, please read the comment to the * get_reaper_busy() function. */ idletime = reaperage; busytime = get_reaper_busy(fc->pid); if (reaperage >= busytime) idletime = reaperage - busytime; ret = snprintf(d->buf, d->buflen, "%.2lf %.2lf\n", reaperage, idletime); if (ret < 0 || ret >= d->buflen) return read_file_fuse("/proc/uptime", buf, size, d); total_len = ret; d->cached = 1; d->size = total_len; if ((size_t)total_len > size) total_len = size; memcpy(buf, d->buf, total_len); return total_len; } #define CPUALL_MAX_SIZE (BUF_RESERVE_SIZE / 2) static int proc_stat_read(char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { __do_free char *cg = NULL, *cpu_cg = NULL, *cpuset = NULL, *line = NULL; __do_free void *fopen_cache = NULL; __do_free struct cpuacct_usage *cg_cpu_usage = NULL; __do_fclose FILE *f = NULL; struct fuse_context *fc = fuse_get_context(); struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data; struct file_info *d = INTTYPE_TO_PTR(fi->fh); size_t linelen = 0, total_len = 0; int curcpu = -1; /* cpu numbering starts at 0 */ int physcpu = 0; uint64_t user = 0, nice = 0, system = 0, idle = 0, iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guest_nice = 0; uint64_t user_sum = 0, nice_sum = 0, system_sum = 0, idle_sum = 0, iowait_sum = 
0, irq_sum = 0, softirq_sum = 0, steal_sum = 0, guest_sum = 0, guest_nice_sum = 0; char cpuall[CPUALL_MAX_SIZE]; /* reserve for cpu all */ char *cache = d->buf + CPUALL_MAX_SIZE; size_t cache_size = d->buflen - CPUALL_MAX_SIZE; int cg_cpu_usage_size = 0; if (offset) { size_t left; if (offset > d->size) return -EINVAL; if (!d->cached) return 0; left = d->size - offset; total_len = left > size ? size : left; memcpy(buf, d->buf + offset, total_len); return total_len; } pid_t initpid = lookup_initpid_in_store(fc->pid); if (initpid <= 1 || is_shared_pidns(initpid)) initpid = fc->pid; /* * when container run with host pid namespace initpid == 1, cgroup will "/" * we should return host os's /proc contents. * in some case cpuacct_usage.all in "/" will larger then /proc/stat */ if (initpid == 1) return read_file_fuse("/proc/stat", buf, size, d); cg = get_pid_cgroup(initpid, "cpuset"); if (!cg) return read_file_fuse("/proc/stat", buf, size, d); prune_init_slice(cg); cpu_cg = get_pid_cgroup(initpid, "cpu"); if (!cpu_cg) return read_file_fuse("/proc/stat", buf, size, d); prune_init_slice(cpu_cg); cpuset = get_cpuset(cg); if (!cpuset) return 0; f = fopen_cached("/proc/stat", "re", &fopen_cache); if (!f) return 0; /* Skip first system cpu line. */ if (getline(&line, &linelen, f) < 0) return log_error(0, "proc_stat_read read first line failed"); /* * Read cpuacct.usage_all for all CPUs. * If the cpuacct cgroup is present, it is used to calculate the container's * CPU usage. If not, values from the host's /proc/stat are used. 
*/ if (read_cpuacct_usage_all(cg, cpuset, &cg_cpu_usage, &cg_cpu_usage_size) == 0) { if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs) { total_len = cpuview_proc_stat(cg, cpu_cg, cpuset, cg_cpu_usage, cg_cpu_usage_size, f, d->buf, d->buflen); goto out; } } else { lxcfs_v("proc_stat_read failed to read from cpuacct, falling back to the host's /proc/stat"); } while (getline(&line, &linelen, f) != -1) { ssize_t l; char cpu_char[10]; /* That's a lot of cores */ char *c; uint64_t all_used, cg_used, new_idle; int ret, cpu_to_render; if (strlen(line) == 0) continue; if (sscanf(line, "cpu%9[^ ]", cpu_char) != 1) { /* not a ^cpuN line containing a number N, just print it */ l = snprintf(cache, cache_size, "%s", line); if (l < 0) return log_error(0, "Failed to write cache"); if ((size_t)l >= cache_size) return log_error(0, "Write to cache was truncated"); cache += l; cache_size -= l; total_len += l; continue; } if (sscanf(cpu_char, "%d", &physcpu) != 1) continue; if (!cpu_in_cpuset(physcpu, cpuset)) continue; curcpu++; if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs) cpu_to_render = curcpu; else cpu_to_render = physcpu; ret = sscanf( line, "%*s" /* */ " %" PRIu64 /* user */ " %" PRIu64 /* nice */ " %" PRIu64 /* system */ " %" PRIu64 /* idle */ " %" PRIu64 /* iowait */ " %" PRIu64 /* irq */ " %" PRIu64 /* softirq */ " %" PRIu64 /* steal */ " %" PRIu64 /* guest */ " %" PRIu64, /* guest_nice */ &user, &nice, &system, &idle, &iowait, &irq, &softirq, &steal, &guest, &guest_nice); if (ret != 10 || !cg_cpu_usage) { c = strchr(line, ' '); if (!c) continue; l = snprintf(cache, cache_size, "cpu%d%s", cpu_to_render, c); if (l < 0) return log_error(0, "Failed to write cache"); if ((size_t)l >= cache_size) return log_error(0, "Write to cache was truncated"); cache += l; cache_size -= l; total_len += l; if (ret != 10) continue; } if (cg_cpu_usage) { if (physcpu >= cg_cpu_usage_size) break; all_used = user + nice + system + iowait + irq + softirq + 
steal + guest + guest_nice; cg_used = cg_cpu_usage[physcpu].user + cg_cpu_usage[physcpu].system; if (all_used >= cg_used) { new_idle = idle + (all_used - cg_used); } else { lxcfs_debug("cpu%d from %s has unexpected cpu time: %" PRIu64 " in /proc/stat, %" PRIu64 " in cpuacct.usage_all; unable to determine idle time", cpu_to_render, cg, all_used, cg_used); new_idle = idle; } l = snprintf(cache, cache_size, "cpu%d %" PRIu64 " 0 %" PRIu64 " %" PRIu64 " 0 0 0 0 0 0\n", cpu_to_render, cg_cpu_usage[physcpu].user, cg_cpu_usage[physcpu].system, new_idle); if (l < 0) return log_error(0, "Failed to write cache"); if ((size_t)l >= cache_size) return log_error(0, "Write to cache was truncated"); cache += l; cache_size -= l; total_len += l; user_sum += cg_cpu_usage[physcpu].user; system_sum += cg_cpu_usage[physcpu].system; idle_sum += new_idle; } else { user_sum += user; nice_sum += nice; system_sum += system; idle_sum += idle; iowait_sum += iowait; irq_sum += irq; softirq_sum += softirq; steal_sum += steal; guest_sum += guest; guest_nice_sum += guest_nice; } } cache = d->buf; int cpuall_len = snprintf( cpuall, CPUALL_MAX_SIZE, "cpu " " %" PRIu64 /* user_sum */ " %" PRIu64 /* nice_sum */ " %" PRIu64 /* system_sum */ " %" PRIu64 /* idle_sum */ " %" PRIu64 /* iowait_sum */ " %" PRIu64 /* irq_sum */ " %" PRIu64 /* softirq_sum */ " %" PRIu64 /* steal_sum */ " %" PRIu64 /* guest_sum */ " %" PRIu64 /* guest_nice_sum */ "\n", user_sum, nice_sum, system_sum, idle_sum, iowait_sum, irq_sum, softirq_sum, steal_sum, guest_sum, guest_nice_sum); if (cpuall_len > 0 && cpuall_len < CPUALL_MAX_SIZE) { memcpy(cache, cpuall, cpuall_len); cache += cpuall_len; } else { /* shouldn't happen */ lxcfs_error("proc_stat_read copy cpuall failed, cpuall_len=%d", cpuall_len); cpuall_len = 0; } memmove(cache, d->buf + CPUALL_MAX_SIZE, total_len); total_len += cpuall_len; out: d->cached = 1; d->size = total_len; if (total_len > size) total_len = size; memcpy(buf, d->buf, total_len); return total_len; } /* Note 
that "memory.stat" in cgroup2 is hierarchical by default. */ static bool cgroup_parse_memory_stat(const char *cgroup, struct memory_stat *mstat) { __do_close int fd = -EBADF; __do_fclose FILE *f = NULL; __do_free char *line = NULL; __do_free void *fdopen_cache = NULL; bool unified; size_t len = 0; ssize_t linelen; fd = cgroup_ops->get_memory_stats_fd(cgroup_ops, cgroup); if (fd < 0) return false; f = fdopen_cached(fd, "re", &fdopen_cache); if (!f) return false; unified = pure_unified_layout(cgroup_ops); while ((linelen = getline(&line, &len, f)) != -1) { if (!unified && startswith(line, "hierarchical_memory_limit")) { sscanf(line, "hierarchical_memory_limit %" PRIu64, &(mstat->hierarchical_memory_limit)); } else if (!unified && startswith(line, "hierarchical_memsw_limit")) { sscanf(line, "hierarchical_memsw_limit %" PRIu64, &(mstat->hierarchical_memsw_limit)); } else if (startswith(line, unified ? "file" :"total_cache")) { sscanf(line, unified ? "file %" PRIu64 : "total_cache %" PRIu64, &(mstat->total_cache)); } else if (!unified && startswith(line, "total_rss")) { sscanf(line, "total_rss %" PRIu64, &(mstat->total_rss)); } else if (!unified && startswith(line, "total_rss_huge")) { sscanf(line, "total_rss_huge %" PRIu64, &(mstat->total_rss_huge)); } else if (startswith(line, unified ? "shmem" : "total_shmem")) { sscanf(line, unified ? "shmem %" PRIu64 : "total_shmem %" PRIu64, &(mstat->total_shmem)); } else if (startswith(line, unified ? "file_mapped" : "total_mapped_file")) { sscanf(line, unified ? 
"file_mapped %" PRIu64 : "total_mapped_file %" PRIu64, &(mstat->total_mapped_file)); } else if (!unified && startswith(line, "total_dirty")) { sscanf(line, "total_dirty %" PRIu64, &(mstat->total_dirty)); } else if (!unified && startswith(line, "total_writeback")) { sscanf(line, "total_writeback %" PRIu64, &(mstat->total_writeback)); } else if (!unified && startswith(line, "total_swap")) { sscanf(line, "total_swap %" PRIu64, &(mstat->total_swap)); } else if (!unified && startswith(line, "total_pgpgin")) { sscanf(line, "total_pgpgin %" PRIu64, &(mstat->total_pgpgin)); } else if (!unified && startswith(line, "total_pgpgout")) { sscanf(line, "total_pgpgout %" PRIu64, &(mstat->total_pgpgout)); } else if (startswith(line, unified ? "pgfault" : "total_pgfault")) { sscanf(line, unified ? "pgfault %" PRIu64 : "total_pgfault %" PRIu64, &(mstat->total_pgfault)); } else if (startswith(line, unified ? "pgmajfault" : "total_pgmajfault")) { sscanf(line, unified ? "pgmajfault %" PRIu64 : "total_pgmajfault %" PRIu64, &(mstat->total_pgmajfault)); } else if (startswith(line, unified ? "inactive_anon" : "total_inactive_anon")) { sscanf(line, unified ? "inactive_anon %" PRIu64 : "total_inactive_anon %" PRIu64, &(mstat->total_inactive_anon)); } else if (startswith(line, unified ? "active_anon" : "total_active_anon")) { sscanf(line, unified ? "active_anon %" PRIu64 : "total_active_anon %" PRIu64, &(mstat->total_active_anon)); } else if (startswith(line, unified ? "inactive_file" : "total_inactive_file")) { sscanf(line, unified ? "inactive_file %" PRIu64 : "total_inactive_file %" PRIu64, &(mstat->total_inactive_file)); } else if (startswith(line, unified ? "active_file" : "total_active_file")) { sscanf(line, unified ? "active_file %" PRIu64 : "total_active_file %" PRIu64, &(mstat->total_active_file)); } else if (startswith(line, unified ? "unevictable" : "total_unevictable")) { sscanf(line, unified ? 
"unevictable %" PRIu64 : "total_unevictable %" PRIu64, &(mstat->total_unevictable)); } else if (unified && startswith(line, "slab ")) { sscanf(line, "slab %" PRIu64, &(mstat->slab)); } else if (unified && startswith(line, "slab_reclaimable")) { sscanf(line, "slab_reclaimable %" PRIu64, &(mstat->slab_reclaimable)); } else if (unified && startswith(line, "slab_unreclaimable")) { sscanf(line, "slab_unreclaimable %" PRIu64, &(mstat->slab_unreclaimable)); } } return true; } static int proc_meminfo_read(char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { __do_free char *cgroup = NULL, *line = NULL, *memusage_str = NULL, *memswusage_str = NULL, *memswpriority_str = NULL; __do_free void *fopen_cache = NULL; __do_fclose FILE *f = NULL; struct fuse_context *fc = fuse_get_context(); bool wants_swap = lxcfs_has_opt(fuse_get_context()->private_data, LXCFS_SWAP_ON); struct file_info *d = INTTYPE_TO_PTR(fi->fh); uint64_t memlimit = 0, memusage = 0, hosttotal = 0, swfree = 0, swusage = 0, swtotal = 0, memswpriority = 1; struct memory_stat mstat = {}; size_t linelen = 0, total_len = 0; char *cache = d->buf; size_t cache_size = d->buflen; int ret; if (offset) { size_t left; if (offset > d->size) return -EINVAL; if (!d->cached) return 0; left = d->size - offset; total_len = left > size ? 
size : left; memcpy(buf, cache + offset, total_len); return total_len; } pid_t initpid = lookup_initpid_in_store(fc->pid); if (initpid <= 1 || is_shared_pidns(initpid)) initpid = fc->pid; cgroup = get_pid_cgroup(initpid, "memory"); if (!cgroup) return read_file_fuse("/proc/meminfo", buf, size, d); prune_init_slice(cgroup); /* memory limits */ ret = cgroup_ops->get_memory_current(cgroup_ops, cgroup, &memusage_str); if (ret < 0) return read_file_fuse("/proc/meminfo", buf, size, d); if (safe_uint64(memusage_str, &memusage, 10) < 0) lxcfs_error("Failed to convert memusage %s", memusage_str); if (!cgroup_parse_memory_stat(cgroup, &mstat)) return read_file_fuse("/proc/meminfo", buf, size, d); ret = get_min_memlimit(cgroup, false, &memlimit); if (ret < 0) return read_file_fuse("/proc/meminfo", buf, size, d); /* * Following values are allowed to fail, because swapaccount might be * turned off for current kernel. */ if (wants_swap) get_swap_info(cgroup, memlimit, memusage, &swtotal, &swusage, &memswpriority); f = fopen_cached("/proc/meminfo", "re", &fopen_cache); if (!f) return read_file_fuse("/proc/meminfo", buf, size, d); memusage /= 1024; memlimit /= 1024; while (getline(&line, &linelen, f) != -1) { ssize_t l; char *printme, lbuf[100]; memset(lbuf, 0, 100); if (startswith(line, "MemTotal:")) { sscanf(line+sizeof("MemTotal:")-1, "%" PRIu64, &hosttotal); if (memlimit == 0) memlimit = hosttotal; if (hosttotal < memlimit) memlimit = hosttotal; snprintf(lbuf, 100, "MemTotal: %8" PRIu64 " kB\n", memlimit); printme = lbuf; } else if (startswith(line, "MemFree:")) { snprintf(lbuf, 100, "MemFree: %8" PRIu64 " kB\n", memlimit - memusage); printme = lbuf; } else if (startswith(line, "MemAvailable:")) { snprintf(lbuf, 100, "MemAvailable: %8" PRIu64 " kB\n", memlimit - memusage + (mstat.total_active_file + mstat.total_inactive_file + mstat.slab_reclaimable) / 1024); printme = lbuf; } else if (startswith(line, "SwapTotal:")) { if (wants_swap) { uint64_t hostswtotal = 0; sscanf(line + 
STRLITERALLEN("SwapTotal:"), "%" PRIu64, &hostswtotal); /* In cgroups v1, the total amount of swap is always reported to be the lesser of the RAM+SWAP limit or the SWAP device size. This is because the kernel can swap as much as it wants and not only up to swtotal. */ if (!liblxcfs_memory_is_cgroupv2()) swtotal += memlimit; if (hostswtotal < swtotal) { swtotal = hostswtotal; } /* When swappiness is 0, pretend we can't swap. */ if (memswpriority == 0) { swtotal = swusage; } } snprintf(lbuf, 100, "SwapTotal: %8" PRIu64 " kB\n", swtotal); printme = lbuf; } else if (startswith(line, "SwapFree:")) { if (wants_swap) { swfree = swtotal - swusage; } snprintf(lbuf, 100, "SwapFree: %8" PRIu64 " kB\n", swfree); printme = lbuf; } else if (startswith(line, "Slab:")) { snprintf(lbuf, 100, "Slab: %8" PRIu64 " kB\n", mstat.slab / 1024); printme = lbuf; } else if (startswith(line, "Buffers:")) { snprintf(lbuf, 100, "Buffers: %8" PRIu64 " kB\n", (uint64_t)0); printme = lbuf; } else if (startswith(line, "Cached:")) { snprintf(lbuf, 100, "Cached: %8" PRIu64 " kB\n", mstat.total_cache / 1024); printme = lbuf; } else if (startswith(line, "SwapCached:")) { snprintf(lbuf, 100, "SwapCached: %8" PRIu64 " kB\n", (uint64_t)0); printme = lbuf; } else if (startswith(line, "Active:")) { snprintf(lbuf, 100, "Active: %8" PRIu64 " kB\n", (mstat.total_active_anon + mstat.total_active_file) / 1024); printme = lbuf; } else if (startswith(line, "Inactive:")) { snprintf(lbuf, 100, "Inactive: %8" PRIu64 " kB\n", (mstat.total_inactive_anon + mstat.total_inactive_file) / 1024); printme = lbuf; } else if (startswith(line, "Active(anon):")) { snprintf(lbuf, 100, "Active(anon): %8" PRIu64 " kB\n", mstat.total_active_anon / 1024); printme = lbuf; } else if (startswith(line, "Inactive(anon):")) { snprintf(lbuf, 100, "Inactive(anon): %8" PRIu64 " kB\n", mstat.total_inactive_anon / 1024); printme = lbuf; } else if (startswith(line, "Active(file):")) { snprintf(lbuf, 100, "Active(file): %8" PRIu64 " kB\n", 
mstat.total_active_file / 1024); printme = lbuf; } else if (startswith(line, "Inactive(file):")) { snprintf(lbuf, 100, "Inactive(file): %8" PRIu64 " kB\n", mstat.total_inactive_file / 1024); printme = lbuf; } else if (startswith(line, "Unevictable:")) { snprintf(lbuf, 100, "Unevictable: %8" PRIu64 " kB\n", mstat.total_unevictable / 1024); printme = lbuf; } else if (startswith(line, "Dirty:")) { snprintf(lbuf, 100, "Dirty: %8" PRIu64 " kB\n", mstat.total_dirty / 1024); printme = lbuf; } else if (startswith(line, "Writeback:")) { snprintf(lbuf, 100, "Writeback: %8" PRIu64 " kB\n", mstat.total_writeback / 1024); printme = lbuf; } else if (startswith(line, "AnonPages:")) { snprintf(lbuf, 100, "AnonPages: %8" PRIu64 " kB\n", (mstat.total_active_anon + mstat.total_inactive_anon - mstat.total_shmem) / 1024); printme = lbuf; } else if (startswith(line, "Mapped:")) { snprintf(lbuf, 100, "Mapped: %8" PRIu64 " kB\n", mstat.total_mapped_file / 1024); printme = lbuf; } else if (startswith(line, "SReclaimable:")) { snprintf(lbuf, 100, "SReclaimable: %8" PRIu64 " kB\n", mstat.slab_reclaimable / 1024); printme = lbuf; } else if (startswith(line, "SUnreclaim:")) { snprintf(lbuf, 100, "SUnreclaim: %8" PRIu64 " kB\n", mstat.slab_unreclaimable / 1024); printme = lbuf; } else if (startswith(line, "Shmem:")) { snprintf(lbuf, 100, "Shmem: %8" PRIu64 " kB\n", mstat.total_shmem / 1024); printme = lbuf; } else if (startswith(line, "ShmemHugePages:")) { snprintf(lbuf, 100, "ShmemHugePages: %8" PRIu64 " kB\n", (uint64_t)0); printme = lbuf; } else if (startswith(line, "ShmemPmdMapped:")) { snprintf(lbuf, 100, "ShmemPmdMapped: %8" PRIu64 " kB\n", (uint64_t)0); printme = lbuf; } else if (startswith(line, "AnonHugePages:")) { snprintf(lbuf, 100, "AnonHugePages: %8" PRIu64 " kB\n", mstat.total_rss_huge / 1024); printme = lbuf; } else { printme = line; } l = snprintf(cache, cache_size, "%s", printme); if (l < 0) return log_error(0, "Failed to write cache"); if ((size_t)l >= cache_size) return 
log_error(0, "Write to cache was truncated"); cache += l; cache_size -= l; total_len += l; } d->cached = 1; d->size = total_len; if (total_len > size) total_len = size; memcpy(buf, d->buf, total_len); return total_len; } static int proc_slabinfo_read(char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { __do_free char *cgroup = NULL, *line = NULL; __do_free void *fopen_cache = NULL; __do_fclose FILE *f = NULL; __do_close int fd = -EBADF; struct fuse_context *fc = fuse_get_context(); struct file_info *d = INTTYPE_TO_PTR(fi->fh); size_t linelen = 0, total_len = 0; char *cache = d->buf; size_t cache_size = d->buflen; pid_t initpid; if (offset) { size_t left; if (offset > d->size) return -EINVAL; if (!d->cached) return 0; left = d->size - offset; total_len = left > size ? size : left; memcpy(buf, cache + offset, total_len); return total_len; } initpid = lookup_initpid_in_store(fc->pid); if (initpid <= 1 || is_shared_pidns(initpid)) initpid = fc->pid; cgroup = get_pid_cgroup(initpid, "memory"); if (!cgroup) return read_file_fuse("/proc/slabinfo", buf, size, d); prune_init_slice(cgroup); fd = cgroup_ops->get_memory_slabinfo_fd(cgroup_ops, cgroup); if (fd < 0) return read_file_fuse("/proc/slabinfo", buf, size, d); f = fdopen_cached(fd, "re", &fopen_cache); if (!f) return read_file_fuse("/proc/slabinfo", buf, size, d); while (getline(&line, &linelen, f) != -1) { ssize_t l = snprintf(cache, cache_size, "%s", line); if (l < 0) return log_error(0, "Failed to write cache"); if ((size_t)l >= cache_size) return log_error(0, "Write to cache was truncated"); cache += l; cache_size -= l; total_len += l; } d->cached = 1; d->size = total_len; if (total_len > size) total_len = size; memcpy(buf, d->buf, total_len); return total_len; } static int proc_read_with_personality(int (*do_proc_read)(char *, size_t, off_t, struct fuse_file_info *), char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { struct fuse_context *fc = fuse_get_context(); __u32 host_personality 
= liblxcfs_personality(), caller_personality; bool change_personality; int ret, read_ret; if (get_task_personality(fc->pid, &caller_personality) < 0) return log_error(0, "Failed to get caller process (pid: %d) personality", fc->pid); /* do we need to change thread personality? */ change_personality = host_personality != caller_personality; if (change_personality) { ret = personality(caller_personality); if (ret == -1) return log_error(0, "Call to personality(%d) failed: %s\n", caller_personality, strerror(errno)); lxcfs_debug("task (tid: %d) personality was changed %d -> %d\n", (int)syscall(SYS_gettid), ret, caller_personality); } read_ret = do_proc_read(buf, size, offset, fi); if (change_personality) { ret = personality(host_personality); if (ret == -1) return log_error(0, "Call to personality(%d) failed: %s\n", host_personality, strerror(errno)); lxcfs_debug("task (tid: %d) personality was restored %d -> %d\n", (int)syscall(SYS_gettid), ret, host_personality); } return read_ret; } __lxcfs_fuse_ops int proc_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { struct file_info *f = INTTYPE_TO_PTR(fi->fh); switch (f->type) { case LXC_TYPE_PROC_MEMINFO: if (liblxcfs_functional()) return proc_meminfo_read(buf, size, offset, fi); return read_file_fuse_with_offset(LXC_TYPE_PROC_MEMINFO_PATH, buf, size, offset, f); case LXC_TYPE_PROC_CPUINFO: if (liblxcfs_functional()) { if (!can_access_personality()) return log_error(-EACCES, RESTRICTED_PERSONALITY_ACCESS_POLICY); return proc_read_with_personality(&proc_cpuinfo_read, buf, size, offset, fi); } return read_file_fuse_with_offset(LXC_TYPE_PROC_CPUINFO_PATH, buf, size, offset, f); case LXC_TYPE_PROC_UPTIME: if (liblxcfs_functional()) return proc_uptime_read(buf, size, offset, fi); return read_file_fuse_with_offset(LXC_TYPE_PROC_UPTIME_PATH, buf, size, offset, f); case LXC_TYPE_PROC_STAT: if (liblxcfs_functional()) return proc_stat_read(buf, size, offset, fi); return 
read_file_fuse_with_offset(LXC_TYPE_PROC_STAT_PATH, buf, size, offset, f); case LXC_TYPE_PROC_DISKSTATS: if (liblxcfs_functional()) return proc_diskstats_read(buf, size, offset, fi); return read_file_fuse_with_offset(LXC_TYPE_PROC_DISKSTATS_PATH, buf, size, offset, f); case LXC_TYPE_PROC_SWAPS: if (liblxcfs_functional()) return proc_swaps_read(buf, size, offset, fi); return read_file_fuse_with_offset(LXC_TYPE_PROC_SWAPS_PATH, buf, size, offset, f); case LXC_TYPE_PROC_LOADAVG: if (liblxcfs_functional()) return proc_loadavg_read(buf, size, offset, fi); return read_file_fuse_with_offset(LXC_TYPE_PROC_LOADAVG_PATH, buf, size, offset, f); case LXC_TYPE_PROC_SLABINFO: if (liblxcfs_functional()) return proc_slabinfo_read(buf, size, offset, fi); return read_file_fuse_with_offset(LXC_TYPE_PROC_SLABINFO_PATH, buf, size, offset, f); } return -EINVAL; } ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/proc_fuse.h0000664000175000017500000000150514773561567016061 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #ifndef __LXCFS_PROC_FUSE_H #define __LXCFS_PROC_FUSE_H #include "config.h" #include "lxcfs_fuse.h" #include #include #include #include #include #include "macro.h" __visible extern int proc_getattr(const char *path, struct stat *sb); __visible extern int proc_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi); __visible extern int proc_open(const char *path, struct fuse_file_info *fi); __visible extern int proc_access(const char *path, int mask); __visible extern int proc_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi); __visible extern int proc_release(const char *path, struct fuse_file_info *fi); #endif /* __LXCFS_PROC_FUSE_H */ ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/proc_loadavg.c0000664000175000017500000004054214773561567016533 
/*
 * One entry of the load_hash table: the load-average state tracked for a
 * single cgroup. Entries of the same bucket are chained through next/pre.
 */
struct load_node {
	/* cgroup */
	char *cg;
	/* Load averages */
	/* Three fixed-point values, updated by refresh_load() with EXP_1, EXP_5 and EXP_15. */
	uint64_t avenrun[3];
	/* Number of tasks refresh_load() found in state 'R' or 'D'. */
	unsigned int run_pid;
	/* Number of task entries refresh_load() counted. */
	unsigned int total_pid;
	/* Largest task id seen by refresh_load(); initially the init pid. */
	unsigned int last_pid;
	/* The file descriptor of the mounted cgroup */
	int cfd;
	/* Next node in the same hash bucket, NULL at the end of the chain. */
	struct load_node *next;
	/*
	 * Address of the pointer that points at this node: either the bucket
	 * head's next field or the previous node's next field. Lets del_node()
	 * unlink the node without walking the chain.
	 */
	struct load_node **pre;
};
But at the same time, we * allow paratactic read operation. This rdlock is at list level. */ pthread_rwlock_t rdlock; /* * The rilock is about read loadavg and insert load_node.To the first * load_node of each hash bucket, read and insert is mutually exclusive. * But at the same time, we allow paratactic read operation. */ pthread_rwlock_t rilock; struct load_node *next; }; static struct load_head load_hash[LOAD_SIZE]; /* hash table */ /* * locate_node() finds special node. Not return NULL means success. * It should be noted that rdlock isn't unlocked at the end of code * because this function is used to read special node. Delete is not * allowed before read has ended. * unlock rdlock only in proc_loadavg_read(). */ static struct load_node *locate_node(char *cg, int locate) { struct load_node *f = NULL; int i = 0; pthread_rwlock_rdlock(&load_hash[locate].rdlock); pthread_rwlock_rdlock(&load_hash[locate].rilock); if (load_hash[locate].next == NULL) { pthread_rwlock_unlock(&load_hash[locate].rilock); return f; } f = load_hash[locate].next; pthread_rwlock_unlock(&load_hash[locate].rilock); while (f && ((i = strcmp(f->cg, cg)) != 0)) f = f->next; return f; } /* * Inserts a new load_node into the load_hash table, * if an appropriate node exists then just free (*n) and * rewrite (n) value to an existing load_node pointer. * * We should enter this function without any locks held. * This function leaves &load_hash[hash].rdlock taken. */ static void insert_node(struct load_node **n, int locate) { struct load_node *f; pthread_mutex_lock(&load_hash[locate].lock); /* * We have to recheck if the node we are looking for * has appeared in the hash table. In this case we just free * newly created load_node and give an existing load_node to use. 
*/ f = locate_node((*n)->cg, locate); if (f) { free_disarm((*n)->cg); free_disarm((*n)); *n = f; pthread_mutex_unlock(&load_hash[locate].lock); return; } /* &load_hash[hash].rdlock is taken for read at this point */ pthread_rwlock_wrlock(&load_hash[locate].rilock); f = load_hash[locate].next; load_hash[locate].next = *n; (*n)->pre = &(load_hash[locate].next); if (f) f->pre = &((*n)->next); (*n)->next = f; pthread_mutex_unlock(&load_hash[locate].lock); pthread_rwlock_unlock(&load_hash[locate].rilock); } int calc_hash(const char *name) { unsigned int hash = 0; unsigned int x = 0; /* ELFHash algorithm. */ while (*name) { hash = (hash << 4) + *name++; x = hash & 0xf0000000; if (x != 0) hash ^= (x >> 24); hash &= ~x; } return (hash & 0x7fffffff); } int proc_loadavg_read(char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { __do_free char *cg = NULL; struct fuse_context *fc = fuse_get_context(); struct file_info *d = INTTYPE_TO_PTR(fi->fh); pid_t initpid; ssize_t total_len = 0; struct load_node *n; int hash; int cfd; uint64_t a, b, c; if (offset) { size_t left; if (offset > d->size) return -EINVAL; if (!d->cached) return 0; left = d->size - offset; total_len = left > size ? size : left; memcpy(buf, d->buf + offset, total_len); return total_len; } if (!loadavg) return read_file_fuse("/proc/loadavg", buf, size, d); initpid = lookup_initpid_in_store(fc->pid); if (initpid <= 1 || is_shared_pidns(initpid)) initpid = fc->pid; cg = get_pid_cgroup(initpid, "cpu"); if (!cg) return read_file_fuse("/proc/loadavg", buf, size, d); prune_init_slice(cg); hash = calc_hash(cg) % LOAD_SIZE; n = locate_node(cg, hash); /* First time */ if (n == NULL) { cfd = get_cgroup_fd("cpu"); if (cfd < 0) { /* * In locate_node() above, pthread_rwlock_unlock() isn't used * because delete is not allowed before read has ended. 
*/ pthread_rwlock_unlock(&load_hash[hash].rdlock); return read_file_fuse("/proc/loadavg", buf, size, d); } n = must_realloc(NULL, sizeof(struct load_node)); n->cg = move_ptr(cg); n->avenrun[0] = 0; n->avenrun[1] = 0; n->avenrun[2] = 0; n->run_pid = 0; n->total_pid = 1; n->last_pid = initpid; n->cfd = cfd; pthread_rwlock_unlock(&load_hash[hash].rdlock); insert_node(&n, hash); /* &load_hash[hash].rdlock is taken for reading at this point */ } a = n->avenrun[0] + (FIXED_1 / 200); b = n->avenrun[1] + (FIXED_1 / 200); c = n->avenrun[2] + (FIXED_1 / 200); total_len = snprintf(d->buf, d->buflen, "%" PRIu64 ".%02" PRIu64 " " "%" PRIu64 ".%02lu " "%" PRIu64 ".%02" PRIu64 " " "%d/" "%d " "%d\n", LOAD_INT(a), LOAD_FRAC(a), LOAD_INT(b), LOAD_FRAC(b), LOAD_INT(c), LOAD_FRAC(c), n->run_pid, n->total_pid, n->last_pid); pthread_rwlock_unlock(&load_hash[hash].rdlock); if (total_len < 0 || total_len >= d->buflen) return log_error(0, "Failed to write to cache"); d->size = (int)total_len; d->cached = 1; if ((size_t)total_len > size) total_len = size; memcpy(buf, d->buf, total_len); return total_len; } /* * Find the process pid from cgroup path. * eg:from /sys/fs/cgroup/cpu/docker/containerid/cgroup.procs to find the process pid. * @pid_buf : put pid to pid_buf. * @dpath : the path of cgroup. eg: /docker/containerid or /docker/containerid/child-cgroup ... * @depth : the depth of cgroup in container. * @sum : return the number of pid. * @cfd : the file descriptor of the mounted cgroup. eg: /sys/fs/cgroup/cpu */ static int calc_pid(char ***pid_buf, const char *rel_path, int depth, int sum, int cfd) { __do_free char *line = NULL, *path = NULL; __do_free void *fdopen_cache = NULL; __do_close int fd = -EBADF; __do_fclose FILE *f = NULL; __do_closedir DIR *dir = NULL; struct dirent *file; size_t linelen = 0; int pd; fd = openat(cfd, rel_path, O_RDONLY | O_CLOEXEC); if (fd < 0) return sum; dir = fdopendir(fd); if (!dir) return sum; /* Transfer ownership to fdopendir(). 
*/ move_fd(fd); while (((file = readdir(dir)) != NULL) && depth > 0) { if (strcmp(file->d_name, ".") == 0) continue; if (strcmp(file->d_name, "..") == 0) continue; if (file->d_type == DT_DIR) { __do_free char *path_next = NULL; path_next = must_make_path(rel_path, "/", file->d_name, NULL); pd = depth - 1; sum = calc_pid(pid_buf, path_next, pd, sum, cfd); } } path = must_make_path(rel_path, "/cgroup.procs", NULL); fd = openat(cfd, path, O_RDONLY | O_CLOEXEC); if (fd < 0) return sum; f = fdopen_cached(fd, "re", &fdopen_cache); if (!f) return sum; while (getline(&line, &linelen, f) != -1) { __do_free char *task_pid = NULL; char **pid; task_pid = strdup(line); if (!task_pid) return sum; pid = realloc(*pid_buf, sizeof(char *) * (sum + 1)); if (!pid) return sum; *pid_buf = pid; *(*pid_buf + sum) = move_ptr(task_pid); sum++; } return sum; } /* * calc_load calculates the load according to the following formula: * load1 = load0 * exp + active * (1 - exp) * * @load1: the new loadavg. * @load0: the former loadavg. * @active: the total number of running pid at this moment. * @exp: the fixed-point defined in the beginning. */ static uint64_t calc_load(uint64_t load, uint64_t exp, uint64_t active) { uint64_t newload; active = active > 0 ? active * FIXED_1 : 0; newload = load * exp + active * (FIXED_1 - exp); if (active >= load) newload += FIXED_1 - 1; return newload / FIXED_1; } /* * Return 0 means that container p->cg is closed. * Return -1 means that error occurred in refresh. * Positive num equals the total number of pid. 
*/ static int refresh_load(struct load_node *p, const char *path) { char **idbuf = NULL; char proc_path[STRLITERALLEN("/proc//task//status") + 2 * INTTYPE_TO_STRLEN(pid_t) + 1]; int i, ret, run_pid = 0, total_pid = 0, last_pid = 0; size_t linelen = 0; int sum, length; struct dirent *file; idbuf = must_realloc(NULL, sizeof(char **)); sum = calc_pid(&idbuf, path, DEPTH_DIR, 0, p->cfd); if (!sum) goto out; for (i = 0; i < sum; i++) { __do_closedir DIR *dp = NULL; length = strlen(idbuf[i]) - 1; idbuf[i][length] = '\0'; ret = snprintf(proc_path, sizeof(proc_path), "/proc/%s/task", idbuf[i]); if (ret < 0 || (size_t)ret > sizeof(proc_path)) { i = sum; sum = -1; lxcfs_error("%s\n", "snprintf() failed in refresh_load."); goto err_out; } dp = opendir(proc_path); if (!dp) { lxcfs_error("Failed to open \"%s\"", proc_path); continue; } while ((file = readdir(dp)) != NULL) { __do_free char *line = NULL; __do_fclose FILE *f = NULL; if (strcmp(file->d_name, ".") == 0) continue; if (strcmp(file->d_name, "..") == 0) continue; total_pid++; /* We make the biggest pid become last_pid. */ ret = atof(file->d_name); last_pid = (ret > last_pid) ? ret : last_pid; ret = snprintf(proc_path, sizeof(proc_path), "/proc/%s/task/%s/status", idbuf[i], file->d_name); if (ret < 0 || (size_t)ret > sizeof(proc_path)) { i = sum; sum = -1; lxcfs_error("%s\n", "snprintf() failed in refresh_load."); goto err_out; } f = fopen(proc_path, "re"); if (!f) continue; while (getline(&line, &linelen, f) != -1) if ((line[0] == 'S') && (line[1] == 't')) break; if ((line[7] == 'R') || (line[7] == 'D')) run_pid++; } } /* Calculate the loadavg. 
*/ p->avenrun[0] = calc_load(p->avenrun[0], EXP_1, run_pid); p->avenrun[1] = calc_load(p->avenrun[1], EXP_5, run_pid); p->avenrun[2] = calc_load(p->avenrun[2], EXP_15, run_pid); p->run_pid = run_pid; p->total_pid = total_pid; p->last_pid = last_pid; err_out: for (; i > 0; i--) free(idbuf[i - 1]); out: free(idbuf); return sum; } /* Delete the load_node n and return the next node of it. */ static struct load_node *del_node(struct load_node *n, int locate) { struct load_node *g; pthread_rwlock_wrlock(&load_hash[locate].rdlock); if (n->next == NULL) { *(n->pre) = NULL; } else { *(n->pre) = n->next; n->next->pre = n->pre; } g = n->next; free_disarm(n->cg); free_disarm(n); pthread_rwlock_unlock(&load_hash[locate].rdlock); return g; } /* * Traverse the hash table and update it. */ static void *load_begin(void *arg) { int first_node, sum; struct load_node *f; clock_t time1, time2; int sleep_time; for (;;) { if (loadavg_stop == 1) return NULL; time1 = clock(); for (int i = 0; i < LOAD_SIZE; i++) { pthread_mutex_lock(&load_hash[i].lock); if (load_hash[i].next == NULL) { pthread_mutex_unlock(&load_hash[i].lock); continue; } f = load_hash[i].next; first_node = 1; while (f) { __do_free char *path = NULL; path = must_make_path_relative(f->cg, NULL); sum = refresh_load(f, path); if (sum == 0) f = del_node(f, i); else f = f->next; /* load_hash[i].lock locks only on the first node.*/ if (first_node == 1) { first_node = 0; pthread_mutex_unlock(&load_hash[i].lock); } } } if (loadavg_stop == 1) return NULL; time2 = clock(); sleep_time = FLUSH_TIME - (int)((time2 - time1) / CLOCKS_PER_SEC); if ((sleep_time > 0) && (sleep_time <= FLUSH_TIME)) usleep(sleep_time * 1000000); } } /* * init_load initialize the hash table. * Return 0 on success, return -1 on failure. 
*/ static int init_load(void) { int i; int ret; for (i = 0; i < LOAD_SIZE; i++) { load_hash[i].next = NULL; ret = pthread_mutex_init(&load_hash[i].lock, NULL); if (ret) { lxcfs_error("Failed to initialize lock"); goto out3; } ret = pthread_rwlock_init(&load_hash[i].rdlock, NULL); if (ret) { lxcfs_error("Failed to initialize rdlock"); goto out2; } ret = pthread_rwlock_init(&load_hash[i].rilock, NULL); if (ret) { lxcfs_error("Failed to initialize rilock"); goto out1; } } return 0; out1: pthread_rwlock_destroy(&load_hash[i].rdlock); out2: pthread_mutex_destroy(&load_hash[i].lock); out3: while (i > 0) { i--; pthread_mutex_destroy(&load_hash[i].lock); pthread_rwlock_destroy(&load_hash[i].rdlock); pthread_rwlock_destroy(&load_hash[i].rilock); } return -1; } static void load_free(void) { struct load_node *f, *p; for (int i = 0; i < LOAD_SIZE; i++) { pthread_mutex_lock(&load_hash[i].lock); pthread_rwlock_wrlock(&load_hash[i].rdlock); pthread_rwlock_wrlock(&load_hash[i].rilock); if (load_hash[i].next == NULL) { pthread_mutex_unlock(&load_hash[i].lock); pthread_mutex_destroy(&load_hash[i].lock); pthread_rwlock_unlock(&load_hash[i].rilock); pthread_rwlock_destroy(&load_hash[i].rilock); pthread_rwlock_unlock(&load_hash[i].rdlock); pthread_rwlock_destroy(&load_hash[i].rdlock); continue; } for (f = load_hash[i].next; f;) { free_disarm(f->cg); p = f->next; free_disarm(f); f = p; } pthread_mutex_unlock(&load_hash[i].lock); pthread_mutex_destroy(&load_hash[i].lock); pthread_rwlock_unlock(&load_hash[i].rilock); pthread_rwlock_destroy(&load_hash[i].rilock); pthread_rwlock_unlock(&load_hash[i].rdlock); pthread_rwlock_destroy(&load_hash[i].rdlock); } } /* Return a positive number on success, return 0 on failure. 
*/ pthread_t load_daemon(int load_use) { int ret; pthread_t pid; ret = init_load(); if (ret == -1) return (pthread_t)log_error(0, "Initialize hash_table fails in load_daemon!"); ret = pthread_create(&pid, NULL, load_begin, NULL); if (ret != 0) { load_free(); return (pthread_t)log_error(0, "Create pthread fails in load_daemon!"); } /* use loadavg, here loadavg = 1 */ loadavg = load_use; return pid; } /* Return 0 on success, return -1 on failure. */ int load_daemon_v2(pthread_t *thread, int load_use) { int ret; ret = init_load(); if (ret == -1) return log_error(-1, "Initialize hash_table fails in load_daemon!"); ret = pthread_create(thread, NULL, load_begin, NULL); if (ret != 0) { load_free(); return log_error(-1, "%s - Create pthread fails in load_daemon!", strerror(ret)); } /* use loadavg, here loadavg = 1 */ loadavg = load_use; return 0; } /* Returns 0 on success. */ int stop_load_daemon(pthread_t pid) { int s; /* Signal the thread to gracefully stop */ loadavg_stop = 1; s = pthread_join(pid, NULL); /* Make sure sub thread has been canceled. 
*/ if (s) return log_error(-1, "stop_load_daemon error: failed to join"); load_free(); loadavg_stop = 0; return 0; } ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/proc_loadavg.h0000664000175000017500000000120314773561567016527 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #ifndef __LXCFS_PROC_LOADAVG_FUSE_H #define __LXCFS_PROC_LOADAVG_FUSE_H #include "config.h" #include #include #include #include #include #include "lxcfs_fuse.h" #include "macro.h" __visible extern pthread_t load_daemon(int load_use); __visible extern int load_daemon_v2(pthread_t *thread, int load_use); __visible extern int stop_load_daemon(pthread_t pid); extern int proc_loadavg_read(char *buf, size_t size, off_t offset, struct fuse_file_info *fi); extern int calc_hash(const char *name); #endif /* __LXCFS_PROC_LOADAVG_FUSE_H */ ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/syscall_numbers.h0000664000175000017500000000552414773561567017306 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #ifndef __LXCFS_SYSCALL_NUMBERS_H #define __LXCFS_SYSCALL_NUMBERS_H #include "config.h" #include #include #include #include #include #include #include #ifndef __NR_pivot_root #if defined __i386__ #define __NR_pivot_root 217 #elif defined __x86_64__ #define __NR_pivot_root 155 #elif defined __arm__ #define __NR_pivot_root 218 #elif defined __aarch64__ #define __NR_pivot_root 218 #elif defined __s390__ #define __NR_pivot_root 217 #elif defined __powerpc__ #define __NR_pivot_root 203 #elif defined __sparc__ #define __NR_pivot_root 146 #elif defined __ia64__ #define __NR_pivot_root 183 #elif defined _MIPS_SIM #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ #define __NR_pivot_root 4216 #endif #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ #define __NR_pivot_root 6151 #endif #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ #define __NR_pivot_root 5151 #endif #elif defined 
__loongarch64 #define __NR_pivot_root 41 #else #define -1 #warning "__NR_pivot_root not defined for your architecture" #endif #endif #ifndef __NR_bpf #if defined __i386__ #define __NR_bpf 357 #elif defined __x86_64__ #define __NR_bpf 321 #elif defined __arm__ #define __NR_bpf 386 #elif defined __aarch64__ #define __NR_bpf 386 #elif defined __s390__ #define __NR_bpf 351 #elif defined __powerpc__ #define __NR_bpf 361 #elif defined __sparc__ #define __NR_bpf 349 #elif defined __ia64__ #define __NR_bpf 317 #elif defined _MIPS_SIM #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ #define __NR_bpf 4355 #endif #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ #define __NR_bpf 6319 #endif #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ #define __NR_bpf 5315 #endif #elif defined __loongarch64 #define __NR_bpf 280 #else #define -1 #warning "__NR_bpf not defined for your architecture" #endif #endif #ifndef __NR_pidfd_send_signal #if defined __alpha__ #define __NR_pidfd_send_signal 534 #elif defined _MIPS_SIM #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ #define __NR_pidfd_send_signal 4424 #endif #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ #define __NR_pidfd_send_signal 6424 #endif #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ #define __NR_pidfd_send_signal 5424 #endif #else #define __NR_pidfd_send_signal 424 #endif #endif #ifndef __NR_pidfd_open #if defined __alpha__ #define __NR_pidfd_open 544 #elif defined _MIPS_SIM #if _MIPS_SIM == _MIPS_SIM_ABI32 /* o32 */ #define __NR_pidfd_open 4434 #endif #if _MIPS_SIM == _MIPS_SIM_NABI32 /* n32 */ #define __NR_pidfd_open 6434 #endif #if _MIPS_SIM == _MIPS_SIM_ABI64 /* n64 */ #define __NR_pidfd_open 5434 #endif #else #define __NR_pidfd_open 434 #endif #endif #endif /* __LXCFS_SYSCALL_NUMBERS_H */ ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/sysfs_fuse.c0000664000175000017500000003601214773561567016261 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #include "config.h" #include 
#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "sysfs_fuse.h" #include "bindings.h" #include "memory_utils.h" #include "cgroups/cgroup.h" #include "lxcfs_fuse_compat.h" #include "utils.h" static off_t get_sysfile_size(const char *which); static int do_cpuset_read(char *cg, char *cpu_cg, char *buf, size_t buflen) { __do_free char *cpuset = NULL; struct fuse_context *fc = fuse_get_context(); struct lxcfs_opts *opts = (struct lxcfs_opts *)fc->private_data; int max_cpus = 0; ssize_t total_len = 0; bool use_view; cpuset = get_cpuset(cg); if (!cpuset) return 0; if (cgroup_ops->can_use_cpuview(cgroup_ops) && opts && opts->use_cfs) use_view = true; else use_view = false; if (use_view) max_cpus = max_cpu_count(cg, cpu_cg); if (use_view) { if (max_cpus > 1) total_len = snprintf(buf, buflen, "0-%d\n", max_cpus - 1); else total_len = snprintf(buf, buflen, "0\n"); } else { total_len = snprintf(buf, buflen, "%s\n", cpuset); } if (total_len < 0 || (size_t)total_len >= buflen) return log_error(0, "Failed to write to cache"); return total_len; } static int sys_devices_system_cpu_online_read(char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { __do_free char *cg = NULL, *cpu_cg = NULL; struct fuse_context *fc = fuse_get_context(); struct file_info *d = INTTYPE_TO_PTR(fi->fh); char *cache = d->buf; pid_t initpid; ssize_t total_len = 0; if (offset) { size_t left; if (!d->cached) return 0; if (offset > d->size) return -EINVAL; left = d->size - offset; total_len = left > size ? 
size : left; memcpy(buf, cache + offset, total_len); return total_len; } initpid = lookup_initpid_in_store(fc->pid); if (initpid <= 1 || is_shared_pidns(initpid)) initpid = fc->pid; cg = get_pid_cgroup(initpid, "cpuset"); if (!cg) return read_file_fuse("/sys/devices/system/cpu/online", buf, size, d); prune_init_slice(cg); cpu_cg = get_pid_cgroup(initpid, "cpu"); if (!cpu_cg) return read_file_fuse("/sys/devices/system/cpu/online", buf, size, d); prune_init_slice(cpu_cg); total_len = do_cpuset_read(cg, cpu_cg, d->buf, d->buflen); d->size = (int)total_len; d->cached = 1; if ((size_t)total_len > size) total_len = size; memcpy(buf, d->buf, total_len); return total_len; } static int sys_devices_system_cpu_online_getsize(const char *path) { __do_free char *cg = NULL, *cpu_cg = NULL; struct fuse_context *fc = fuse_get_context(); pid_t initpid; char buf[BUF_RESERVE_SIZE]; int buflen = sizeof(buf); initpid = lookup_initpid_in_store(fc->pid); if (initpid <= 1 || is_shared_pidns(initpid)) initpid = fc->pid; cg = get_pid_cgroup(initpid, "cpuset"); if (!cg) return get_sysfile_size(path); cpu_cg = get_pid_cgroup(initpid, "cpu"); if (!cpu_cg) return get_sysfile_size(path); prune_init_slice(cg); prune_init_slice(cpu_cg); return do_cpuset_read(cg, cpu_cg, buf, buflen); } static int filler_sys_devices_system_cpu(const char *path, void *buf, fuse_fill_dir_t filler) { __do_closedir DIR *dirp = NULL; struct dirent *dirent; dirp = opendir(path); if (!dirp) return -ENOENT; while ((dirent = readdir(dirp))) { if (dirent_fillerat(filler, dirp, dirent, buf, 0) != 0) return -ENOENT; } return 0; } static int get_st_mode(const char *path, mode_t *mode) { struct stat sb; int ret; ret = lstat(path, &sb); if (ret < 0) return -ENOENT; *mode = sb.st_mode; return 0; } static off_t get_sysfile_size(const char *which) { __do_fclose FILE *f = NULL; __do_free char *line = NULL; size_t len = 0; ssize_t sz, answer = 0; f = fopen(which, "re"); if (!f) return 0; while ((sz = getline(&line, &len, f)) != -1) 
answer += sz; return answer; } static int sys_getattr_legacy(const char *path, struct stat *sb) { struct timespec now; memset(sb, 0, sizeof(struct stat)); if (clock_gettime(CLOCK_REALTIME, &now) < 0) return -EINVAL; sb->st_uid = sb->st_gid = 0; sb->st_atim = sb->st_mtim = sb->st_ctim = now; if (strcmp(path, "/sys") == 0) { sb->st_mode = S_IFDIR | 00555; sb->st_nlink = 2; return 0; } if (strcmp(path, "/sys/devices") == 0) { sb->st_mode = S_IFDIR | 00555; sb->st_nlink = 2; return 0; } if (strcmp(path, "/sys/devices/system") == 0) { sb->st_mode = S_IFDIR | 00555; sb->st_nlink = 2; return 0; } if (strcmp(path, "/sys/devices/system/cpu") == 0) { sb->st_mode = S_IFDIR | 00555; sb->st_nlink = 2; return 0; } if (strcmp(path, "/sys/devices/system/cpu/online") == 0) { sb->st_size = sys_devices_system_cpu_online_getsize(path); sb->st_mode = S_IFREG | 00444; sb->st_nlink = 1; return 0; } return -ENOENT; } __lxcfs_fuse_ops int sys_getattr(const char *path, struct stat *sb) { int ret; struct timespec now; mode_t st_mode; if (!liblxcfs_functional()) return -EIO; if (!liblxcfs_can_use_sys_cpu()) return sys_getattr_legacy(path, sb); memset(sb, 0, sizeof(struct stat)); if (clock_gettime(CLOCK_REALTIME, &now) < 0) return -EINVAL; sb->st_uid = sb->st_gid = 0; sb->st_atim = sb->st_mtim = sb->st_ctim = now; ret = get_st_mode(path, &st_mode); if (ret) return -ENOENT; if (S_ISDIR(st_mode)) { sb->st_mode = st_mode; sb->st_nlink = 2; return 0; } if (S_ISREG(st_mode) || S_ISLNK(st_mode)) { if (strcmp(path, "/sys/devices/system/cpu/online") == 0) sb->st_size = sys_devices_system_cpu_online_getsize(path); else sb->st_size = get_sysfile_size(path); sb->st_mode = st_mode; sb->st_nlink = 1; return 0; } return -ENOENT; } __lxcfs_fuse_ops int sys_release(const char *path, struct fuse_file_info *fi) { do_release_file_info(fi); return 0; } __lxcfs_fuse_ops int sys_releasedir(const char *path, struct fuse_file_info *fi) { do_release_file_info(fi); return 0; } __lxcfs_fuse_ops int sys_write(const char 
*path, const char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { __do_close int fd = -EBADF; struct file_info *f = INTTYPE_TO_PTR(fi->fh); if (!liblxcfs_functional()) return -EIO; if (f->type != LXC_TYPE_SYS_DEVICES_SYSTEM_CPU_SUBFILE) return -EINVAL; return -EACCES; } static int sys_readdir_legacy(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi) { if (strcmp(path, "/sys") == 0) { if (dir_filler(filler, buf, ".", 0) != 0 || dir_filler(filler, buf, "..", 0) != 0 || dirent_filler(filler, path, "devices", buf, 0) != 0) return -ENOENT; return 0; } if (strcmp(path, "/sys/devices") == 0) { if (dir_filler(filler, buf, ".", 0) != 0 || dir_filler(filler, buf, "..", 0) != 0 || dirent_filler(filler, path, "system", buf, 0) != 0) return -ENOENT; return 0; } if (strcmp(path, "/sys/devices/system") == 0) { if (dir_filler(filler, buf, ".", 0) != 0 || dir_filler(filler, buf, "..", 0) != 0 || dirent_filler(filler, path, "cpu", buf, 0) != 0) return -ENOENT; return 0; } if (strcmp(path, "/sys/devices/system/cpu") == 0) { if (dir_filler(filler, buf, ".", 0) != 0 || dir_filler(filler, buf, "..", 0) != 0 || dirent_filler(filler, path, "online", buf, 0) != 0) return -ENOENT; return 0; } return 0; } __lxcfs_fuse_ops int sys_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi) { __do_closedir DIR *dirp = NULL; struct dirent *dirent; struct file_info *f = INTTYPE_TO_PTR(fi->fh); if (!liblxcfs_functional()) return -EIO; if (!liblxcfs_can_use_sys_cpu()) return sys_readdir_legacy(path, buf, filler, offset, fi); /* * When we reload LXCFS and we don't load the lxcfs binary itself * changes to such functions as lxcfs_opendir() aren't reflected so * sys_opendir() doesn't run but sys_readdir() does. We need to account * for that here. 
*/ if (!f) return -EIO; switch (f->type) { case LXC_TYPE_SYS: if (dir_filler(filler, buf, ".", 0) != 0 || dir_filler(filler, buf, "..", 0) != 0 || dirent_filler(filler, path, "devices", buf, 0) != 0) return -ENOENT; return 0; case LXC_TYPE_SYS_DEVICES: if (dir_filler(filler, buf, ".", 0) != 0 || dir_filler(filler, buf, "..", 0) != 0 || dirent_filler(filler, path, "system", buf, 0) != 0) return -ENOENT; return 0; case LXC_TYPE_SYS_DEVICES_SYSTEM: if (dir_filler(filler, buf, ".", 0) != 0 || dir_filler(filler, buf, "..", 0) != 0 || dirent_filler(filler, path, "cpu", buf, 0) != 0) return -ENOENT; return 0; case LXC_TYPE_SYS_DEVICES_SYSTEM_CPU: if (dir_filler(filler, buf, ".", 0) != 0 || dir_filler(filler, buf, "..", 0) != 0 || dirent_filler(filler, path, "online", buf, 0) != 0) return -ENOENT; return filler_sys_devices_system_cpu(path, buf, filler); case LXC_TYPE_SYS_DEVICES_SYSTEM_CPU_SUBDIR: dirp = opendir_flags(path, O_CLOEXEC | O_NOFOLLOW); if (!dirp) return -errno; while ((dirent = readdir(dirp))) { if (dirent_fillerat(filler, dirp, dirent, buf, 0) != 0) return -ENOENT; } return 0; } return -EINVAL; } __lxcfs_fuse_ops int sys_readlink(const char *path, char *buf, size_t size) { ssize_t ret; if (!liblxcfs_functional()) return -EIO; ret = readlink(path, buf, size); if (ret < 0) return -errno; if ((size_t)ret > size) return -1; buf[ret] = '\0'; return 0; } static int sys_open_legacy(const char *path, struct fuse_file_info *fi) { __do_free struct file_info *info = NULL; int type = -1; if (strcmp(path, "/sys/devices") == 0) type = LXC_TYPE_SYS_DEVICES; if (strcmp(path, "/sys/devices/system") == 0) type = LXC_TYPE_SYS_DEVICES_SYSTEM; if (strcmp(path, "/sys/devices/system/cpu") == 0) type = LXC_TYPE_SYS_DEVICES_SYSTEM_CPU; if (strcmp(path, "/sys/devices/system/cpu/online") == 0) type = LXC_TYPE_SYS_DEVICES_SYSTEM_CPU_ONLINE; if (type == -1) return -ENOENT; info = malloc(sizeof(*info)); if (!info) return -ENOMEM; memset(info, 0, sizeof(*info)); info->type = type; 
info->buflen = get_sysfile_size(path) + BUF_RESERVE_SIZE; info->buf = malloc(info->buflen); if (!info->buf) return -ENOMEM; memset(info->buf, 0, info->buflen); /* set actual size to buffer size */ info->size = info->buflen; fi->fh = PTR_TO_UINT64(move_ptr(info)); return 0; } __lxcfs_fuse_ops int sys_open(const char *path, struct fuse_file_info *fi) { __do_free struct file_info *info = NULL; int type = -1; if (!liblxcfs_functional()) return -EIO; if (!liblxcfs_can_use_sys_cpu()) return sys_open_legacy(path, fi); if (strcmp(path, "/sys/devices/system/cpu/online") == 0) { type = LXC_TYPE_SYS_DEVICES_SYSTEM_CPU_ONLINE; } else if (strncmp(path, "/sys/devices/system/cpu/", STRLITERALLEN("/sys/devices/system/cpu/")) == 0) { int ret; mode_t st_mode; ret = get_st_mode(path, &st_mode); if (ret) return ret; if (S_ISREG(st_mode)) type = LXC_TYPE_SYS_DEVICES_SYSTEM_CPU_SUBFILE; } if (type == -1) return -ENOENT; info = malloc(sizeof(*info)); if (!info) return -ENOMEM; memset(info, 0, sizeof(*info)); info->type = type; info->buflen = get_sysfile_size(path) + BUF_RESERVE_SIZE; info->buf = malloc(info->buflen); if (!info->buf) return -ENOMEM; memset(info->buf, 0, info->buflen); /* set actual size to buffer size */ info->size = info->buflen; fi->fh = PTR_TO_UINT64(move_ptr(info)); return 0; } __lxcfs_fuse_ops int sys_opendir(const char *path, struct fuse_file_info *fi) { __do_free struct file_info *dir_info = NULL; int type = -1; if (!liblxcfs_functional()) return -EIO; if (strcmp(path, "/sys") == 0) { type = LXC_TYPE_SYS; } else if (strcmp(path, "/sys/devices") == 0) { type = LXC_TYPE_SYS_DEVICES; } else if (strcmp(path, "/sys/devices/system") == 0) { type = LXC_TYPE_SYS_DEVICES_SYSTEM; } else if (strcmp(path, "/sys/devices/system/cpu") == 0) { type = LXC_TYPE_SYS_DEVICES_SYSTEM_CPU; } else if (strncmp(path, "/sys/devices/system/cpu/", STRLITERALLEN("/sys/devices/system/cpu/")) == 0) { int ret; mode_t st_mode; ret = get_st_mode(path, &st_mode); if (ret) return ret; if 
(S_ISDIR(st_mode)) type = LXC_TYPE_SYS_DEVICES_SYSTEM_CPU_SUBDIR; } if (type == -1) return -ENOENT; dir_info = malloc(sizeof(*dir_info)); if (!dir_info) return -ENOMEM; memset(dir_info, 0, sizeof(*dir_info)); dir_info->type = type; dir_info->buf = NULL; dir_info->file = NULL; dir_info->buflen = 0; fi->fh = PTR_TO_UINT64(move_ptr(dir_info)); return 0; } static int sys_access_legacy(const char *path, int mask) { if (strcmp(path, "/sys") == 0 && access(path, R_OK) == 0) return 0; if (strcmp(path, "/sys/devices") == 0 && access(path, R_OK) == 0) return 0; if (strcmp(path, "/sys/devices/system") == 0 && access(path, R_OK) == 0) return 0; if (strcmp(path, "/sys/devices/system/cpu") == 0 && access(path, R_OK) == 0) return 0; /* these are all read-only */ if ((mask & ~R_OK) != 0) return -EACCES; return 0; } __lxcfs_fuse_ops int sys_access(const char *path, int mask) { if (!liblxcfs_functional()) return -EIO; if (!liblxcfs_can_use_sys_cpu()) return sys_access_legacy(path, mask); return access(path, mask); } static int sys_read_legacy(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { struct file_info *f = INTTYPE_TO_PTR(fi->fh); switch (f->type) { case LXC_TYPE_SYS_DEVICES_SYSTEM_CPU_ONLINE: if (liblxcfs_functional()) return sys_devices_system_cpu_online_read(buf, size, offset, fi); return read_file_fuse_with_offset(LXC_TYPE_SYS_DEVICES_SYSTEM_CPU_ONLINE_PATH, buf, size, offset, f); case LXC_TYPE_SYS_DEVICES: break; case LXC_TYPE_SYS_DEVICES_SYSTEM: break; case LXC_TYPE_SYS_DEVICES_SYSTEM_CPU: break; } return -EINVAL; } __lxcfs_fuse_ops int sys_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { struct file_info *f = INTTYPE_TO_PTR(fi->fh); if (!liblxcfs_functional()) return -EIO; if (!liblxcfs_can_use_sys_cpu()) return sys_read_legacy(path, buf, size, offset, fi); switch (f->type) { case LXC_TYPE_SYS_DEVICES_SYSTEM_CPU_ONLINE: return sys_devices_system_cpu_online_read(buf, size, offset, fi); case 
LXC_TYPE_SYS_DEVICES_SYSTEM_CPU_SUBFILE: return read_file_fuse_with_offset(path, buf, size, offset, f); } return -EINVAL; } ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/sysfs_fuse.h0000664000175000017500000000225114773561567016264 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #ifndef __LXCFS_SYSFS_FUSE_H #define __LXCFS_SYSFS_FUSE_H #include "config.h" #include #include #include #include #include #include "lxcfs_fuse.h" #include "macro.h" __visible extern int sys_getattr(const char *path, struct stat *sb); __visible extern int sys_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi); __visible extern int sys_readlink(const char *path, char *buf, size_t size); __visible extern int sys_release(const char *path, struct fuse_file_info *fi); __visible extern int sys_releasedir(const char *path, struct fuse_file_info *fi); __visible extern int sys_open(const char *path, struct fuse_file_info *fi); __visible extern int sys_opendir(const char *path, struct fuse_file_info *fi); __visible extern int sys_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi); __visible extern int sys_write(const char *path, const char *buf, size_t size, off_t offset, struct fuse_file_info *fi); __visible extern int sys_access(const char *path, int mask); #endif /* __LXCFS_SYSFS_FUSE_H */ ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/utils.c0000664000175000017500000003553314773561567015237 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #include "config.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "utils.h" #include "bindings.h" #include "macro.h" #include "memory_utils.h" /* * append the given formatted string to *src. 
* src: a pointer to a char* in which to append the formatted string. * sz: the number of characters printed so far, minus trailing \0. * asz: the allocated size so far * format: string format. See printf for details. * ...: varargs. See printf for details. */ char *must_strcat(char **src, size_t *sz, size_t *asz, const char *format, ...) { char tmp[BUF_RESERVE_SIZE]; va_list args; int tmplen; va_start (args, format); tmplen = vsnprintf(tmp, BUF_RESERVE_SIZE, format, args); va_end(args); if (!*src || tmplen + *sz + 1 >= *asz) { char *str; do { str = realloc(*src, *asz + BUF_RESERVE_SIZE); } while (!str); *src = str; *asz += BUF_RESERVE_SIZE; } memcpy((*src) +*sz , tmp, tmplen+1); /* include the \0 */ *sz += tmplen; return *src; } /** * in_same_namespace - Check whether two processes are in the same namespace. * @pid1 - PID of the first process. * @pid2 - PID of the second process. * @ns - Name of the namespace to check. Must correspond to one of the names * for the namespaces as shown in /proc/= __NS_PATH_LEN) { errno = EFBIG; return -1; } return open(path, O_RDONLY | O_CLOEXEC); } void do_release_file_info(struct fuse_file_info *fi) { struct file_info *f; f = INTTYPE_TO_PTR(fi->fh); if (!f) return; fi->fh = 0; free_disarm(f->controller); free_disarm(f->cgroup); free_disarm(f->file); free_disarm(f->buf); free_disarm(f); } #define POLLIN_SET ( EPOLLIN | EPOLLHUP | EPOLLRDHUP ) bool wait_for_sock(int sock, int timeout) { __do_close int epfd = -EBADF; struct epoll_event ev; int ret; time_t now, starttime, deltatime; if ((starttime = time(NULL)) < 0) return false; epfd = epoll_create(1); if (epfd < 0) return log_error(false, "%m - Failed to create epoll socket"); ev.events = POLLIN_SET; ev.data.fd = sock; if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0) return log_error(false, "Failed adding socket to epoll: %m"); again: if ((now = time(NULL)) < 0) return false; deltatime = (starttime + timeout) - now; if (deltatime < 0) return false; ret = epoll_wait(epfd, &ev, 1, 
1000*deltatime + 1); if (ret < 0 && errno == EINTR) goto again; if (ret <= 0) return false; return true; } bool recv_creds(int sock, struct ucred *cred, char *v) { struct msghdr msg = {}; struct iovec iov; struct cmsghdr *cmsg; ssize_t ret; char cmsgbuf[CMSG_SPACE(sizeof(*cred))] = {}; char buf = '1'; int optval = 1; msg.msg_name = NULL; msg.msg_namelen = 0; msg.msg_control = cmsgbuf; msg.msg_controllen = sizeof(cmsgbuf); iov.iov_base = &buf; iov.iov_len = sizeof(buf); msg.msg_iov = &iov; msg.msg_iovlen = 1; *v = buf; ret = setsockopt(sock, SOL_SOCKET, SO_PASSCRED, &optval, sizeof(optval)); if (ret < 0) return log_error(false, "Failed to set passcred: %s\n", strerror(errno)); ret = write_nointr(sock, &buf, sizeof(buf)); if (ret != sizeof(buf)) return log_error(false, "Failed to start write on scm fd: %s\n", strerror(errno)); if (!wait_for_sock(sock, 2)) return log_error(false, "Timed out waiting for scm_cred: %s\n", strerror(errno)); ret = recvmsg(sock, &msg, MSG_DONTWAIT); if (ret < 0) return log_error(false, "Failed to receive scm_cred: %s\n", strerror(errno)); cmsg = CMSG_FIRSTHDR(&msg); if (cmsg && cmsg->cmsg_len == CMSG_LEN(sizeof(*cred)) && cmsg->cmsg_level == SOL_SOCKET && cmsg->cmsg_type == SCM_CREDENTIALS) { memcpy(cred, CMSG_DATA(cmsg), sizeof(*cred)); } *v = buf; return true; } static int msgrecv(int sockfd, void *buf, size_t len) { if (!wait_for_sock(sockfd, 2)) return -1; return recv(sockfd, buf, len, MSG_DONTWAIT); } int send_creds(int sock, struct ucred *cred, char v, bool pingfirst) { struct msghdr msg = { 0 }; struct iovec iov; struct cmsghdr *cmsg; char cmsgbuf[CMSG_SPACE(sizeof(*cred))]; char buf[1]; buf[0] = 'p'; if (pingfirst && msgrecv(sock, buf, 1) != 1) return log_error(SEND_CREDS_FAIL, "%s - Failed getting reply from server over socketpair: %d", strerror(errno), SEND_CREDS_FAIL); msg.msg_control = cmsgbuf; msg.msg_controllen = sizeof(cmsgbuf); cmsg = CMSG_FIRSTHDR(&msg); cmsg->cmsg_len = CMSG_LEN(sizeof(struct ucred)); cmsg->cmsg_level = 
SOL_SOCKET; cmsg->cmsg_type = SCM_CREDENTIALS; memcpy(CMSG_DATA(cmsg), cred, sizeof(*cred)); msg.msg_name = NULL; msg.msg_namelen = 0; buf[0] = v; iov.iov_base = buf; iov.iov_len = sizeof(buf); msg.msg_iov = &iov; msg.msg_iovlen = 1; if (sendmsg(sock, &msg, 0) < 0) { if (errno == 3) return log_error(SEND_CREDS_NOTSK, "%s - Failed at sendmsg: %d", strerror(errno), SEND_CREDS_NOTSK); return log_error(SEND_CREDS_FAIL, "%s - Failed at sendmsg: %d", strerror(errno), SEND_CREDS_FAIL); } return SEND_CREDS_OK; } int read_file_fuse(const char *path, char *buf, size_t size, struct file_info *d) { __do_free char *line = NULL; __do_fclose FILE *f = NULL; size_t linelen = 0, total_len = 0; char *cache = d->buf; size_t cache_size = d->buflen; f = fopen(path, "re"); if (!f) return 0; while (getline(&line, &linelen, f) != -1) { ssize_t l; l = snprintf(cache, cache_size, "%s", line); if (l < 0) return log_error(0, "Failed to write cache"); if ((size_t)l >= cache_size) return log_error(0, "Write to cache was truncated"); cache += l; cache_size -= l; total_len += l; } d->size = total_len; if (total_len > size) total_len = size; /* read from off 0 */ memcpy(buf, d->buf, total_len); if (d->size > (int)total_len) d->cached = d->size - total_len; return total_len; } int read_file_fuse_with_offset(const char *path, char *buf, size_t size, off_t offset, struct file_info *d) { if (offset) { ssize_t total_len = 0; char *cache = d->buf; size_t left; if (offset > d->size) return -EINVAL; if (!d->cached) return 0; left = d->size - offset; total_len = left > size ? 
size : left; memcpy(buf, cache + offset, total_len); return total_len; } return read_file_fuse(path, buf, size, d); } #define INITSCOPE "/init.scope" void prune_init_slice(char *cg) { char *point; size_t cg_len = strlen(cg), initscope_len = strlen(INITSCOPE); if (cg_len < initscope_len) return; point = cg + cg_len - initscope_len; if (strcmp(point, INITSCOPE) == 0) { if (point == cg) *(point + 1) = '\0'; else *point = '\0'; } } int wait_for_pid(pid_t pid) { int status, ret; if (pid <= 0) return -1; again: ret = waitpid(pid, &status, 0); if (ret == -1) { if (errno == EINTR) goto again; return -1; } if (ret != pid) goto again; if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) return -1; return 0; } static ssize_t read_nointr(int fd, void *buf, size_t count) { ssize_t ret; again: ret = read(fd, buf, count); if (ret < 0 && errno == EINTR) goto again; return ret; } static void *must_realloc(void *orig, size_t sz) { void *ret; do { ret = realloc(orig, sz); } while (!ret); return ret; } static char *fd_to_buf(int fd, size_t *length) { __do_free char *copy = NULL; if (!length) return NULL; *length = 0; for (;;) { ssize_t bytes_read; char buf[4096]; char *old = copy; bytes_read = read_nointr(fd, buf, sizeof(buf)); if (bytes_read < 0) return NULL; if (!bytes_read) break; copy = must_realloc(old, (*length + bytes_read) * sizeof(*old)); memcpy(copy + *length, buf, bytes_read); *length += bytes_read; } return move_ptr(copy); } static char *file_to_buf(const char *path, size_t *length) { __do_close int fd = -EBADF; if (!length) return NULL; fd = open(path, O_RDONLY | O_CLOEXEC); if (fd < 0) return NULL; return fd_to_buf(fd, length); } FILE *fopen_cached(const char *path, const char *mode, void **caller_freed_buffer) { __do_free char *buf = NULL; size_t len = 0; FILE *f; buf = file_to_buf(path, &len); if (!buf) return NULL; f = fmemopen(buf, len, mode); if (!f) return NULL; *caller_freed_buffer = move_ptr(buf); return f; } FILE *fdopen_cached(int fd, const char *mode, void 
**caller_freed_buffer)
{
	__do_free char *buf = NULL;
	size_t len = 0;
	FILE *f;

	buf = fd_to_buf(fd, &len);
	if (!buf)
		return NULL;

	f = fmemopen(buf, len, mode);
	if (!f)
		return NULL;
	/* The stream reads from buf; the caller frees it after fclose(). */
	*caller_freed_buffer = move_ptr(buf);
	return f;
}

/* write() that restarts when interrupted by a signal (EINTR). */
ssize_t write_nointr(int fd, const void *buf, size_t count)
{
	ssize_t ret;

	do {
		ret = write(fd, buf, count);
	} while (ret < 0 && errno == EINTR);

	return ret;
}

/*
 * safe_uint64 - strictly parse an unsigned 64-bit number.
 *
 * Leading whitespace is skipped. Returns 0 and stores the value in
 * @converted on success, -EINVAL for a negative, empty or partially numeric
 * string and -ERANGE on overflow.
 */
int safe_uint64(const char *numstr, uint64_t *converted, int base)
{
	char *err = NULL;
	uint64_t u;

	/* isspace() is only defined for unsigned char values and EOF. */
	while (isspace((unsigned char)*numstr))
		numstr++;

	if (*numstr == '-')
		return -EINVAL;

	errno = 0;
	u = strtoull(numstr, &err, base);
	if (errno == ERANGE && u == UINT64_MAX)
		return -ERANGE;

	if (err == numstr || *err != '\0')
		return -EINVAL;

	*converted = u;
	return 0;
}

/*
 * safe_uint32 - strictly parse an unsigned 32-bit number.
 *
 * Same contract as safe_uint64(). Values that do not fit into 32 bits yield
 * -ERANGE instead of being silently truncated.
 */
int safe_uint32(const char *numstr, uint32_t *converted, int base)
{
	char *err = NULL;
	unsigned long uli;

	while (isspace((unsigned char)*numstr))
		numstr++;

	if (*numstr == '-')
		return -EINVAL;

	errno = 0;
	uli = strtoul(numstr, &err, base);
	/*
	 * errno was cleared above, so ERANGE can only come from strtoul()
	 * overflowing unsigned long (it then returns ULONG_MAX, which is not
	 * UINT32_MAX where long is 64 bits wide).
	 */
	if (errno == ERANGE)
		return -ERANGE;

	if (err == numstr || *err != '\0')
		return -EINVAL;

	/* Where unsigned long is wider than 32 bits the value may not fit. */
	if (uli > UINT32_MAX)
		return -ERANGE;

	*converted = (uint32_t)uli;
	return 0;
}

/* Index of the first character that is neither ' ' nor '\t'; 0 if none. */
static int char_left_gc(const char *buffer, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++) {
		if (buffer[i] == ' ' ||
		    buffer[i] == '\t')
			continue;

		return i;
	}

	return 0;
}

/* Length of @buffer without trailing ' ', '\t', '\n' and '\0'; 0 if none. */
static int char_right_gc(const char *buffer, size_t len)
{
	int i;

	for (i = len - 1; i >= 0; i--) {
		if (buffer[i] == ' '  ||
		    buffer[i] == '\t' ||
		    buffer[i] == '\n' ||
		    buffer[i] == '\0')
			continue;

		return i + 1;
	}

	return 0;
}

/*
 * Trim @buffer in place: the returned pointer skips leading blanks and the
 * string is cut after its last non-blank character.
 */
char *trim_whitespace_in_place(char *buffer)
{
	buffer += char_left_gc(buffer, strlen(buffer));
	buffer[char_right_gc(buffer, strlen(buffer))] = '\0';
	return buffer;
}

#define BATCH_SIZE 50
/*
 * Grow *mem in BATCH_SIZE steps so that it can hold @newlen bytes; only
 * reallocates when *mem is NULL or a new batch is needed.
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
static int batch_realloc(char **mem, size_t oldlen, size_t newlen)
{
	int newbatches = (newlen / BATCH_SIZE) + 1;
	int oldbatches = (oldlen / BATCH_SIZE) + 1;

	if (!*mem || newbatches > oldbatches) {
		char *tmp;

		tmp = realloc(*mem, newbatches * BATCH_SIZE);
		if (!tmp)
			return -ENOMEM;
		*mem = tmp;
	}

	return 0;
}
static int append_line(char **dest, size_t oldlen, char *new, size_t newlen) { int ret; size_t full = oldlen + newlen; ret = batch_realloc(dest, oldlen, full + 1); if (ret) return ret; memcpy(*dest + oldlen, new, newlen + 1); return 0; } /* Slurp in a whole file */ char *read_file_at(int dfd, const char *fnam, unsigned int o_flags) { __do_close int fd = -EBADF; __do_free char *buf = NULL, *line = NULL; __do_fclose FILE *f = NULL; size_t len = 0, fulllen = 0; int linelen; fd = openat(dfd, fnam, o_flags, 0); if (fd < 0) return NULL; f = fdopen(fd, "re"); if (!f) return NULL; /* Transfer ownership to fdopen(). */ move_fd(fd); while ((linelen = getline(&line, &len, f)) != -1) { if (append_line(&buf, fulllen, line, linelen)) return NULL; fulllen += linelen; } return move_ptr(buf); } DIR *opendir_flags(const char *path, int flags) { __do_close int dfd = -EBADF; DIR *dirp; dfd = open(path, O_DIRECTORY | flags); if (dfd < 0) return NULL; dirp = fdopendir(dfd); if (dirp) move_fd(dfd); /* Transfer ownership to fdopendir(). */ return dirp; } int get_task_personality(pid_t pid, __u32 *personality) { __do_close int fd = -EBADF; int ret = -1; char path[STRLITERALLEN("/proc//personality") + INTTYPE_TO_STRLEN(pid_t) + 1]; /* seq_printf(m, "%08x\n", task->personality); */ char buf[8 + 1]; ret = strnprintf(path, sizeof(path), "/proc/%d/personality", pid); if (ret < 0) return -1; fd = open(path, O_RDONLY | O_CLOEXEC); if (fd < 0) return -1; ret = read_nointr(fd, buf, sizeof(buf) - 1); if (ret >= 0) { buf[ret] = '\0'; if (personality != NULL && safe_uint32(buf, personality, 16) < 0) return log_error(-1, "Failed to convert personality %s", buf); } return ret; } /* This function checks whether system security policy (i.e. Yama LSM) allows personality access, by trying on init own one. This is required as it may be restricted by a ptrace access mode check (see PROC(5)), and `get_task_personality` function relies on this. 
*/ bool can_access_personality(void) { static int could_access_init_personality = -1; /* init personality has never been accessed (cache is empty) */ if (could_access_init_personality == -1) { if (get_task_personality(1, NULL) < 0) { could_access_init_personality = 0; } else { could_access_init_personality = 1; } } return could_access_init_personality != 0; } #if !HAVE_STRLCPY size_t strlcpy(char *dest, const char *src, size_t size) { size_t ret = strlen(src); if (size) { size_t len = (ret >= size) ? size - 1 : ret; memcpy(dest, src, len); dest[len] = '\0'; } return ret; } #endif #if !HAVE_STRLCAT size_t strlcat(char *d, const char *s, size_t n) { size_t l = strnlen(d, n); if (l == n) return l + strlen(s); return l + strlcpy(d + l, s, n - l); } #endif ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/src/utils.h0000664000175000017500000000565314773561567015244 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #ifndef __LXCFS_UTILS_H #define __LXCFS_UTILS_H #include "config.h" #include #include #include #include #include #include #include #include "lxcfs_fuse.h" #include "macro.h" #include "syscall_numbers.h" /* Reserve buffer size to account for file size changes. 
*/ #define BUF_RESERVE_SIZE 512 #define SEND_CREDS_OK 0 #define SEND_CREDS_NOTSK 1 #define SEND_CREDS_FAIL 2 #define RESTRICTED_PERSONALITY_ACCESS_POLICY "Due to restricted personality access policy, reading proc files from containers is not permitted" struct file_info; __attribute__((__format__(__printf__, 4, 5))) extern char *must_strcat(char **src, size_t *sz, size_t *asz, const char *format, ...); extern bool is_shared_pidns(pid_t pid); extern int preserve_ns(const int pid, const char *ns); extern void do_release_file_info(struct fuse_file_info *fi); extern bool recv_creds(int sock, struct ucred *cred, char *v); extern int send_creds(int sock, struct ucred *cred, char v, bool pingfirst); extern bool wait_for_sock(int sock, int timeout); extern int read_file_fuse(const char *path, char *buf, size_t size, struct file_info *d); extern int read_file_fuse_with_offset(const char *path, char *buf, size_t size, off_t offset, struct file_info *d); extern void prune_init_slice(char *cg); extern int wait_for_pid(pid_t pid); #if !HAVE_PIDFD_OPEN static inline int pidfd_open(pid_t pid, unsigned int flags) { return syscall(__NR_pidfd_open, pid, flags); } #endif #if !HAVE_PIDFD_SEND_SIGNAL static inline int pidfd_send_signal(int pidfd, int sig, siginfo_t *info, unsigned int flags) { return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags); } #endif extern FILE *fopen_cached(const char *path, const char *mode, void **caller_freed_buffer); extern FILE *fdopen_cached(int fd, const char *mode, void **caller_freed_buffer); extern DIR *opendir_flags(const char *path, int oflags); extern ssize_t write_nointr(int fd, const void *buf, size_t count); extern int safe_uint64(const char *numstr, uint64_t *converted, int base); extern int safe_uint32(const char *numstr, uint32_t *converted, int base); extern char *trim_whitespace_in_place(char *buffer); static inline bool file_exists(const char *f) { struct stat statbuf; return stat(f, &statbuf) == 0; } #define 
PROTECT_OPEN_WITH_TRAILING_SYMLINKS (O_CLOEXEC | O_NOCTTY | O_RDONLY) #define PROTECT_OPEN (PROTECT_OPEN_WITH_TRAILING_SYMLINKS | O_NOFOLLOW) extern char *read_file_at(int dfd, const char *fnam, unsigned int o_flags); extern int get_task_personality(pid_t pid, __u32 *personality); extern bool can_access_personality(void); extern int get_host_personality(__u32 *personality); #if !HAVE_STRLCPY extern size_t strlcpy(char *, const char *, size_t); #endif #if !HAVE_STRLCAT extern size_t strlcat(char *d, const char *s, size_t n); #endif #endif /* __LXCFS_UTILS_H */ ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/tests/0000775000175000017500000000000014773561567014275 5ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/tests/cpusetrange.c0000664000175000017500000000201014773561567016752 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #include "config.h" #include #include #include #include #include "../src/cpuset_parse.h" static void verify(bool condition) { if (condition) { printf(" PASS\n"); } else { printf(" FAIL!\n"); exit(1); } } int main(void) { char *a = "1,2"; char *b = "1-3,5"; char *c = "1,4-5"; char *d = ""; char *e = "\n"; printf("1 in %s", a); verify(cpu_in_cpuset(1, a)); printf("2 in %s", a); verify(cpu_in_cpuset(2, a)); printf("NOT 4 in %s", a); verify(!cpu_in_cpuset(4, a)); printf("1 in %s", b); verify(cpu_in_cpuset(1, b)); printf("NOT 4 in %s", b); verify(!cpu_in_cpuset(4, b)); printf("5 in %s", b); verify(cpu_in_cpuset(5, b)); printf("1 in %s", c); verify(cpu_in_cpuset(1, c)); printf("5 in %s", c); verify(cpu_in_cpuset(5, c)); printf("NOT 6 in %s", c); verify(!cpu_in_cpuset(6, c)); printf("NOT 6 in empty set"); verify(!cpu_in_cpuset(6, d)); printf("NOT 6 in empty set(2)"); verify(!cpu_in_cpuset(6, e)); } ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 
lxcfs-6.0.4/tests/live-upgrade-test.sh.in0000775000175000017500000000543314773561567020607 0ustar00stgraberstgraber#!/bin/sh # SPDX-License-Identifier: LGPL-2.1+ set -eu [ -n "${DEBUG:-}" ] && set -x [ $(id -u) -eq 0 ] NEW_LXCFS_TREE=$1 echo "LXCFS trees:" pwd echo "${NEW_LXCFS_TREE}" # Run lxcfs testsuite export LXCFSDIR=$(mktemp -d) pidfile=$(mktemp) export LXCFSPID=-1 cmdline=$(realpath $0) dirname=$(dirname ${cmdline}) FAILED=1 UNSHARE=1 cleanup() { echo "=> Cleaning up" set +e if [ $LXCFSPID -ne -1 ]; then kill -9 $LXCFSPID fi if [ ${LXCFSDIR} != "/var/lib/lxcfs" ]; then umount -l ${LXCFSDIR} rmdir ${LXCFSDIR} fi rm -f ${pidfile} if [ ${FAILED} -eq 1 ]; then echo "=> FAILED at $TESTCASE" exit 1 fi echo "=> PASSED" exit 0 } TESTCASE="setup" lxcfs="{{LXCFS_BUILD_ROOT}}/lxcfs" if [ -x ${lxcfs} ]; then if [ -n "${LD_LIBRARY_PATH:-}" ]; then export LD_LIBRARY_PATH="{{LXCFS_BUILD_ROOT}}:${LD_LIBRARY_PATH}" else export LD_LIBRARY_PATH="{{LXCFS_BUILD_ROOT}}" fi echo "=> Spawning ${lxcfs} ${LXCFSDIR}" ${lxcfs} --enable-cgroup -p ${pidfile} ${LXCFSDIR} & LXCFSPID=$! else UNSHARE=0 LXCFSPID=$(cat "{{DEFAULT_RUNTIME_PATH}}/lxcfs.pid") echo "=> Re-using host lxcfs" rmdir $LXCFSDIR export LXCFSDIR=/var/lib/lxcfs fi trap cleanup EXIT HUP INT TERM count=1 while ! 
mountpoint -q $LXCFSDIR; do sleep 1s if [ $count -gt 5 ]; then echo "lxcfs failed to start" false fi count=$((count+1)) done RUNTEST() { echo "" echo "=> Running ${TESTCASE}" if [ "${UNSHARE:-1}" != "0" ]; then unshare -fmp --mount-proc $* else $* fi } RUNTESTS() { TESTCASE="Stress readdir" RUNTEST ${dirname}/test_readdir TESTCASE="test_proc" RUNTEST ${dirname}/test_proc TESTCASE="test_cgroup" RUNTEST ${dirname}/test_cgroup TESTCASE="test_read_proc.sh" RUNTEST ${dirname}/test_read_proc.sh TESTCASE="cpusetrange" RUNTEST ${dirname}/test-cpusetrange TESTCASE="meminfo hierarchy" RUNTEST ${dirname}/test_meminfo_hierarchy.sh TESTCASE="SIGUSR2 virtualization mode switching" echo "==> Switching to non-virtualization mode" kill -USR2 $LXCFSPID RUNTEST ${dirname}/test_sigusr2.sh echo "==> Switching to virtualization mode" kill -USR2 $LXCFSPID } echo "" echo "=> Running tests BEFORE reload" RUNTESTS TESTCASE="liblxcfs reloading (with upgrade)" rm -f /tmp/lxcfs-iwashere echo "==> Ensure that lxcfs is functional BEFORE reload" cat ${LXCFSDIR}/proc/uptime libdir="{{LXCFS_BUILD_ROOT}}" [ ! 
-f /tmp/lxcfs-iwashere ] rm -f ${libdir}/liblxcfs.so ${libdir}/liblxcfs.la cp ${NEW_LXCFS_TREE}/build/liblxcfstest.so ${libdir}/liblxcfs.so echo "==> Reload liblxcfs" kill -USR1 $LXCFSPID sleep 1 echo "==> Ensure that lxcfs is functional AFTER reload" cat ${LXCFSDIR}/proc/uptime sleep 1 [ -f /tmp/lxcfs-iwashere ] echo "" echo "=> Running tests AFTER reload" RUNTESTS # Check for any defunct processes - children we didn't reap n=`ps -ef | grep lxcfs | grep defunct | wc -l` [ $n = 0 ] FAILED=0 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/tests/main.sh.in0000775000175000017500000000421714773561567016171 0ustar00stgraberstgraber#!/bin/sh # SPDX-License-Identifier: LGPL-2.1+ set -eu [ -n "${DEBUG:-}" ] && set -x [ $(id -u) -eq 0 ] # Run lxcfs testsuite export LXCFSDIR=$(mktemp -d) pidfile=$(mktemp) export LXCFSPID=-1 cmdline=$(realpath $0) dirname=$(dirname ${cmdline}) FAILED=1 UNSHARE=1 cleanup() { echo "=> Cleaning up" set +e if [ $LXCFSPID -ne -1 ]; then kill -9 $LXCFSPID fi if [ ${LXCFSDIR} != "/var/lib/lxcfs" ]; then umount -l ${LXCFSDIR} rmdir ${LXCFSDIR} fi rm -f ${pidfile} if [ ${FAILED} -eq 1 ]; then echo "=> FAILED at $TESTCASE" exit 1 fi echo "=> PASSED" exit 0 } TESTCASE="setup" lxcfs="{{LXCFS_BUILD_ROOT}}/lxcfs" if [ -x ${lxcfs} ]; then if [ -n "${LD_LIBRARY_PATH:-}" ]; then export LD_LIBRARY_PATH="{{LXCFS_BUILD_ROOT}}:${LD_LIBRARY_PATH}" else export LD_LIBRARY_PATH="{{LXCFS_BUILD_ROOT}}" fi echo "=> Spawning ${lxcfs} ${LXCFSDIR}" ${lxcfs} --enable-cgroup -p ${pidfile} ${LXCFSDIR} & LXCFSPID=$! else UNSHARE=0 LXCFSPID=$(cat "{{DEFAULT_RUNTIME_PATH}}/lxcfs.pid") echo "=> Re-using host lxcfs" rmdir $LXCFSDIR export LXCFSDIR=/var/lib/lxcfs fi trap cleanup EXIT HUP INT TERM count=1 while ! 
mountpoint -q $LXCFSDIR; do sleep 1s if [ $count -gt 5 ]; then echo "lxcfs failed to start" false fi count=$((count+1)) done RUNTEST() { echo "" echo "=> Running ${TESTCASE}" if [ "${UNSHARE:-1}" != "0" ]; then unshare -fmp --mount-proc $* else $* fi } TESTCASE="Stress readdir" RUNTEST ${dirname}/test_readdir TESTCASE="test_proc" RUNTEST ${dirname}/test_proc TESTCASE="test_cgroup" RUNTEST ${dirname}/test_cgroup TESTCASE="test_read_proc.sh" RUNTEST ${dirname}/test_read_proc.sh TESTCASE="cpusetrange" RUNTEST ${dirname}/test-cpusetrange TESTCASE="meminfo hierarchy" RUNTEST ${dirname}/test_meminfo_hierarchy.sh TESTCASE="liblxcfs reloading" UNSHARE=0 RUNTEST ${dirname}/test_reload.sh TESTCASE="SIGUSR2 virtualization mode switching" echo "==> Switching to non-virtualization mode" kill -USR2 $LXCFSPID RUNTEST ${dirname}/test_sigusr2.sh echo "==> Switching to virtualization mode" kill -USR2 $LXCFSPID # Check for any defunct processes - children we didn't reap n=`ps -ef | grep lxcfs | grep defunct | wc -l` [ $n = 0 ] FAILED=0 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/tests/meson.build0000664000175000017500000000650714773561567016447 0ustar00stgraberstgraber# SPDX-License-Identifier: LGPL-2.1-or-later test_programs += custom_target( 'main.sh', build_by_default: want_tests != false, input: 'main.sh.in', output: 'main.sh', command: [ meson_render_jinja2, config_h, '@INPUT@', '@OUTPUT@', ]) test_programs += custom_target( 'live-upgrade-test.sh', build_by_default: want_tests != false, input: 'live-upgrade-test.sh.in', output: 'live-upgrade-test.sh', command: [ meson_render_jinja2, config_h, '@INPUT@', '@OUTPUT@', ]) test_programs += custom_target( 'test_cgroup', build_by_default: want_tests != false, input: 'test_cgroup.in', output: 'test_cgroup', command: [ meson_render_jinja2, config_h, '@INPUT@', '@OUTPUT@', ]) test_programs += custom_target( 'test_confinement.sh', build_by_default: want_tests != false, input: 
'test_confinement.sh.in', output: 'test_confinement.sh', command: [ meson_render_jinja2, config_h, '@INPUT@', '@OUTPUT@', ]) test_programs += custom_target( 'test_meminfo_hierarchy.sh', build_by_default: want_tests != false, input: 'test_meminfo_hierarchy.sh.in', output: 'test_meminfo_hierarchy.sh', command: [ meson_render_jinja2, config_h, '@INPUT@', '@OUTPUT@', ]) test_programs += custom_target( 'test_proc', build_by_default: want_tests != false, input: 'test_proc.in', output: 'test_proc', command: [ meson_render_jinja2, config_h, '@INPUT@', '@OUTPUT@', ]) test_programs += custom_target( 'test_read_proc.sh', build_by_default: want_tests != false, input: 'test_read_proc.sh.in', output: 'test_read_proc.sh', command: [ meson_render_jinja2, config_h, '@INPUT@', '@OUTPUT@', ]) test_programs += custom_target( 'test_readdir', build_by_default: want_tests != false, input: 'test_readdir.in', output: 'test_readdir', command: [ meson_render_jinja2, config_h, '@INPUT@', '@OUTPUT@', ]) test_programs += custom_target( 'test_reload.sh', build_by_default: want_tests != false, input: 'test_reload.sh.in', output: 'test_reload.sh', command: [ meson_render_jinja2, config_h, '@INPUT@', '@OUTPUT@', ]) test_programs += custom_target( 'test_sigusr2.sh', build_by_default: want_tests != false, input: 'test_sigusr2.sh.in', output: 'test_sigusr2.sh', command: [ meson_render_jinja2, config_h, '@INPUT@', '@OUTPUT@', ]) test_programs += executable( 'test-read', 'test-read.c', include_directories: config_include, install: false, build_by_default: want_tests != false) test_programs += executable( 'test-syscalls', 'test_syscalls.c', include_directories: config_include, install: false, build_by_default: want_tests != false) test_cpusetrange_sources = files( 'cpusetrange.c', '../src/cpuset_parse.c', '../src/cpuset_parse.h') test_programs += executable( 'test-cpusetrange', test_cpusetrange_sources, include_directories: config_include, install: false, build_by_default: want_tests != false) 
././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/tests/test-read.c0000664000175000017500000000177014773561567016336 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #include "config.h" #include #include #include #include #include #include #include #include #define BUFSIZE 1025 char buf[BUFSIZE]; int read_count = 2; int main(int argc, char *argv[]){ if(argc < 3){ fprintf(stderr, "usage: %s [buffer|direct]\n", argv[0]); exit(1); } char *file = argv[1]; read_count = atoi(argv[2]); int ret = 0,sum = 0, i = 0, fd = -1; if(argc == 4 && strncmp(argv[3], "direct",6) == 0) fd = open(file, O_RDONLY|O_DIRECT); else fd = open(file, O_RDONLY); while(i++ < read_count){ memset(buf, 0, BUFSIZE); ret = read(fd, buf, BUFSIZE-1); if(ret > 0){ write(STDOUT_FILENO, buf, ret); sum += ret; }else if(ret == 0){ printf("======read end======\n"); break; }else{ printf("error:%d\n", errno); break; } sleep(1); } printf("======read sum: %d======\n", sum); if (fd >= 0) close(fd); return 0; } ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/tests/test_cgroup.in0000775000175000017500000000334414773561567017172 0ustar00stgraberstgraber#!/bin/sh # SPDX-License-Identifier: LGPL-2.1+ set -eu [ -n "${DEBUG:-}" ] && set -x PASS=0 UUID=$(uuidgen) cleanup() { [ "$PASS" = "1" ] || (echo FAIL && exit 1) } LXCFSDIR=${LXCFSDIR:-/var/lib/lxcfs} trap cleanup EXIT HUP INT TERM if ! mountpoint -q ${LXCFSDIR}; then echo "lxcfs isn't mounted on ${LXCFSDIR}" exit 1 fi IS_CGROUP_V2=0 grep -qF 'cgroup cgroup' /proc/1/mountinfo || IS_CGROUP_V2=1 # # LXCFS cgroupfs emulation doesn't make any sense with cgroup2 # if [ "$IS_CGROUP_V2" = "1" ]; then PASS=1 exit 0 fi echo "==> Setting up memory, freeze and cpuset cgroups" for c in memory freezer cpuset; do [ ! 
-d /sys/fs/cgroup/${c} ] && exit 0 done initcpuset=`awk -F: '/cpuset/ { print $3 }' /proc/1/cgroup` initmemory=`awk -F: '/memory/ { print $3 }' /proc/1/cgroup` initfreezer=`awk -F: '/freezer/ { print $3 }' /proc/1/cgroup` cpupath=/sys/fs/cgroup/cpuset/${initcpuset} mempath=/sys/fs/cgroup/memory/${initmemory} frzpath=/sys/fs/cgroup/freezer/${initfreezer} rmdir ${cpupath}/${UUID} 2>/dev/null || true rmdir ${mempath}/${UUID} 2>/dev/null || true rmdir ${frzpath}/${UUID} 2>/dev/null || true mkdir ${cpupath}/${UUID} mkdir ${mempath}/${UUID} mkdir ${frzpath}/${UUID} # Check that the fs is readable for p in ${mempath} ${frzpath} ${cpupath}; do echo "==> Test that ${p} is readable" find ${p} > /dev/null echo 1 > ${p}/${UUID}/tasks done echo "==> Testing memory.limit_in_bytes" echo $((64*1024*1024)) > ${LXCFSDIR}/cgroup/memory/${initmemory}/${UUID}/memory.limit_in_bytes v=`cat $mempath/${UUID}/memory.limit_in_bytes` [ "$v" = "$((64*1024*1024))" ] echo "==> Testing cpuset.cpus" echo 0 > ${LXCFSDIR}/cgroup/cpuset/${initcpuset}/${UUID}/cpuset.cpus v=`cat ${cpupath}/${UUID}/cpuset.cpus` [ "$v" = "0" ] PASS=1 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/tests/test_confinement.sh.in0000775000175000017500000000424114773561567020606 0ustar00stgraberstgraber#!/bin/sh # SPDX-License-Identifier: LGPL-2.1+ set -ex UUID=$(uuidgen) [ $(id -u) -eq 0 ] d=$(mktemp -t -d tmp.XXX) d2=$(mktemp -t -d tmp.XXX) pid=-1 cleanup() { [ $pid -ne -1 ] && kill -9 $pid umount -l $d || true umount -l $d2 || true rm -rf $d $d2 } cmdline=$(realpath $0) dirname=$(dirname ${cmdline}) trap cleanup EXIT HUP INT TERM lxcfs="{{LXCFS_BUILD_ROOT}}/lxcfs" $d & pid=$! 
# put ourselves into x1 cgm movepidabs freezer / 1 cgm create freezer x1 cgm movepid freezer x1 1 mount -t cgroup -o freezer freezer $d2 sudo rmdir $d2/${UUID}_a1/${UUID}_a2 || true sudo rmdir $d2/${UUID}_a1 || true echo "Making sure root cannot mkdir" bad=0 mkdir $d/cgroup/freezer/${UUID}_a1 && bad=1 if [ "${bad}" -eq 1 ]; then false fi echo "Making sure root cannot rmdir" mkdir $d2/${UUID}_a1 mkdir $d2/${UUID}_a1/${UUID}_a2 rmdir $d/cgroup/freezer/${UUID}_a1 && bad=1 if [ "${bad}" -eq 1 ]; then false fi [ -d $d2/${UUID}_a1 ] rmdir $d/cgroup/freezer/${UUID}_a1/${UUID}_a2 && bad=1 if [ "${bad}" -eq 1 ]; then false fi [ -d $d2/${UUID}_a1/${UUID}_a2 ] echo "Making sure root cannot read/write" sleep 200 & p=$! echo $p > $d/cgroup/freezer/${UUID}_a1/tasks && bad=1 if [ "${bad}" -eq 1 ]; then false fi cat $d/cgroup/freezer/${UUID}_a1/tasks && bad=1 if [ "${bad}" -eq 1 ]; then false fi echo $p > $d/cgroup/freezer/${UUID}_a1/${UUID}_a2/tasks && bad=1 if [ "${bad}" -eq 1 ]; then false fi cat $d/cgroup/freezer/${UUID}_a1/${UUID}_a2/tasks && bad=1 if [ "${bad}" -eq 1 ]; then false fi # make sure things like truncate and access don't leak info about # the /${UUID}_a1 cgroup which we shouldn't be able to reach echo "Testing other system calls" ${dirname}/test-syscalls $d/cgroup/freezer/${UUID}_a1 ${dirname}/test-syscalls $d/cgroup/freezer/${UUID}_a1/${UUID}_a2 echo "Making sure root can act on descendents" mycg=$(cgm getpidcgroupabs freezer 1) newcg=${mycg}/${UUID}_a1 rmdir $d2/$newcg || true # cleanup previosu run mkdir $d/cgroup/freezer/$newcg echo $p > $d/cgroup/freezer/$newcg/tasks cat $d/cgroup/freezer/$newcg/tasks kill -9 $p while [ `wc -l $d/cgroup/freezer/$newcg/tasks | awk '{ print $1 }'` -ne 0 ]; do sleep 1 done rmdir $d/cgroup/freezer/$newcg echo "All tests passed!" 
././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/tests/test_meminfo_hierarchy.sh.in0000775000175000017500000000311414773561567021767 0ustar00stgraberstgraber#!/bin/sh # SPDX-License-Identifier: LGPL-2.1+ set -eu [ -n "${DEBUG:-}" ] && set -x LXCFSDIR=${LXCFSDIR:-/var/lib/lxcfs} cg1=$(uuidgen).$$ cg2=$(uuidgen).$$ cleanup() { if [ $FAILED -eq 1 ]; then exit 1 fi exit 0 } FAILED=1 trap cleanup EXIT HUP INT TERM IS_CGROUP_V2=0 grep -qF 'cgroup cgroup' /proc/1/mountinfo || IS_CGROUP_V2=1 if [ "$IS_CGROUP_V2" = "1" ]; then [ ! -d /sys/fs/cgroup ] && exit 0 echo "==> Setting up cgroup" mempath=/sys/fs/cgroup/ memory_limit_file=memory.max tasks_file=cgroup.procs else [ ! -d /sys/fs/cgroup/memory ] && exit 0 echo "==> Setting up memory cgroup" initmemory=$(awk -F: '/memory/ { print $3 }' /proc/1/cgroup) mempath=/sys/fs/cgroup/memory/${initmemory} memory_limit_file=memory.limit_in_bytes tasks_file=tasks fi rmdir ${mempath}/${cg1} 2>/dev/null || true rmdir ${mempath}/${cg2} 2>/dev/null || true echo "==> Testing /proc/meminfo with limit" mkdir ${mempath}/${cg1} echo 500000000 > ${mempath}/${cg1}/${memory_limit_file} echo 1 > ${mempath}/${cg1}/${tasks_file} m1=$(awk '/^MemTotal:/ { print $2 }' ${LXCFSDIR}/proc/meminfo) if [ "$IS_CGROUP_V2" = "1" ]; then # temporary move to the root cgroup because of # "no internal process" constraint echo 1 > ${mempath}/${tasks_file} echo '+memory' > ${mempath}/${cg1}/cgroup.subtree_control fi echo "==> Testing /proc/meminfo with sub-cgroup" mkdir ${mempath}/${cg1}/${cg2} echo 1 > ${mempath}/${cg1}/${cg2}/${tasks_file} m2=$(awk '/^MemTotal:/ { print $2 }' ${LXCFSDIR}/proc/meminfo) echo "==> Confirming same limits" [ $m1 -eq $m2 ] FAILED=0 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/tests/test_proc.in0000775000175000017500000000524514773561567016640 0ustar00stgraberstgraber#!/bin/sh # SPDX-License-Identifier: LGPL-2.1+ set -eu 
[ -n "${DEBUG:-}" ] && set -x PASS=0 cleanup() { [ "$PASS" = "1" ] || (echo FAIL && exit 1) } trap cleanup EXIT HUP INT TERM LXCFSDIR=${LXCFSDIR:-/var/lib/lxcfs} if ! mountpoint -q ${LXCFSDIR}; then echo "lxcfs isn't mounted on ${LXCFSDIR}" exit 1 fi IS_CGROUP_V2=0 grep -qF 'cgroup cgroup' /proc/1/mountinfo || IS_CGROUP_V2=1 if [ "$IS_CGROUP_V2" = "1" ]; then echo "==> Setting up cgroup in lxcfs_test_proc" [ ! -d /sys/fs/cgroup ] && exit 0 default_hierarchy="/" hierarchy_path=/sys/fs/cgroup/${default_hierarchy} cpupath=${hierarchy_path} mempath=${hierarchy_path} rmdir ${hierarchy_path}/lxcfs_test_proc 2>/dev/null || true mkdir ${hierarchy_path}/lxcfs_test_proc echo 1 > ${hierarchy_path}/lxcfs_test_proc/cgroup.procs echo '+cpu +cpuset +memory' > ${hierarchy_path}/cgroup.subtree_control memory_limit_file=memory.max else echo "==> Setting up memory/cpuset cgroup in lxcfs_test_proc" [ ! -d /sys/fs/cgroup/memory ] && exit 0 [ ! -d /sys/fs/cgroup/cpuset ] && exit 0 initcpuset=$(awk -F: '/cpuset/ { print $3 }' /proc/1/cgroup) initmemory=$(awk -F: '/memory/ { print $3 }' /proc/1/cgroup) cpupath=/sys/fs/cgroup/cpuset/${initcpuset} mempath=/sys/fs/cgroup/memory/${initmemory} rmdir ${cpupath}/lxcfs_test_proc 2>/dev/null || true rmdir ${mempath}/lxcfs_test_proc 2>/dev/null || true mkdir ${cpupath}/lxcfs_test_proc mkdir ${mempath}/lxcfs_test_proc echo 1 > ${cpupath}/lxcfs_test_proc/tasks echo 1 > ${mempath}/lxcfs_test_proc/tasks memory_limit_file=memory.limit_in_bytes fi echo $((64*1024*1024)) > ${mempath}/lxcfs_test_proc/${memory_limit_file} echo 0 > ${cpupath}/lxcfs_test_proc/cpuset.cpus # Test that readdir on /proc basically works echo "==> Testing directory listing on /proc" ls -l ${LXCFSDIR}/proc | grep uptime ls -l ${LXCFSDIR}/proc | grep cpuinfo ls -l ${LXCFSDIR}/proc | grep stat ls -l ${LXCFSDIR}/proc | grep meminfo # Test uptime echo "==> Testing /proc/uptime" grep -Eq "^0.[0-9]{2} 0.[0-9]{2}$" ${LXCFSDIR}/proc/uptime # Test cpuinfo echo "==> Testing /proc/cpuinfo" [ 
"$(grep "^processor" ${LXCFSDIR}/proc/cpuinfo | wc -l)" = "1" ] grep -q "^processor.*0$" ${LXCFSDIR}/proc/cpuinfo || grep -q "^processor 0:.*" ${LXCFSDIR}/proc/cpuinfo echo "==> Testing /sys/devices/system/cpu/online" [ "$(cat ${LXCFSDIR}/sys/devices/system/cpu/online)" = "$(cat ${cpupath}/lxcfs_test_proc/cpuset.cpus)" ] # Test stat echo "==> Testing /proc/stat" [ "$(grep "^cpu" ${LXCFSDIR}/proc/stat | wc -l)" = "2" ] # Test meminfo echo "==> Testing /proc/meminfo" grep -q "^MemTotal.*65536 kB$" ${LXCFSDIR}/proc/meminfo PASS=1 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/tests/test_read_proc.sh.in0000775000175000017500000000100014773561567020225 0ustar00stgraberstgraber#/bin/sh # SPDX-License-Identifier: LGPL-2.1+ set -eu [ -n "${DEBUG:-}" ] && set -x DIR=${LXCFSDIR:-/var/lib/lxcfs} if ! mountpoint -q $DIR; then echo "lxcfs isn't mounted on ${DIR}" exit 1 fi echo "==> Testing /proc/cpuinfo" {{LXCFS_BUILD_ROOT}}/tests/test-read $DIR/proc/cpuinfo 3 >/dev/null echo "==> Testing /proc/stat" {{LXCFS_BUILD_ROOT}}/tests/test-read $DIR/proc/stat 3 >/dev/null echo "==> Testing /proc/meminfo" {{LXCFS_BUILD_ROOT}}/tests/test-read $DIR/proc/meminfo 3 >/dev/null exit 0 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/tests/test_readdir.in0000775000175000017500000000107514773561567017304 0ustar00stgraberstgraber#!/bin/sh # SPDX-License-Identifier: LGPL-2.1+ set -eu [ -n "${DEBUG:-}" ] && set -x PASS=0 UUID=$(uuidgen) cleanup() { [ "$PASS" = "1" ] || (echo FAIL && exit 1) } LXCFSDIR=${LXCFSDIR:-/var/lib/lxcfs} trap cleanup EXIT HUP INT TERM if ! 
mountpoint -q ${LXCFSDIR}; then echo "lxcfs isn't mounted on ${LXCFSDIR}" exit 1 fi TESTCASE="Stress readdir" echo "==> Checking for cpuset cgroups" if [ -d /sys/fs/cgroup/cpuset ]; then for i in `seq 1 1000`;do ls -al "${LXCFSDIR}/cgroup/cpuset" >/dev/null; done else echo "==> Skipping $TESTCASE" fi PASS=1 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/tests/test_reload.sh.in0000775000175000017500000000277614773561567017562 0ustar00stgraberstgraber#!/bin/sh # SPDX-License-Identifier: LGPL-2.1+ set -eu [ -n "${DEBUG:-}" ] && set -x [ $(id -u) -eq 0 ] cmdline=$(realpath $0) topdir={{LXCFS_BUILD_ROOT}} testdir=`mktemp -t -d libs.XXX` installdir=`mktemp -t -d libs.XXX` pidfile=$(mktemp) libdir=${installdir}/{{LIBDIR}} bindir=${installdir}/usr/bin lxcfspid=-1 FAILED=1 cleanup() { if [ ${lxcfspid} -ne -1 ]; then kill -9 ${lxcfspid} count=1 while [ -d ${testdir}/proc -a $count -lt 5 ]; do sleep 1 done umount -l ${testdir} fi rm -rf ${testdir} ${installdir} rm -f /tmp/lxcfs-iwashere rm -f ${pidfile} if [ ${FAILED} -eq 1 ]; then echo "liblxcfs.so reload test FAILED" else echo "liblxcfs.so reload test PASSED" fi } trap cleanup EXIT HUP INT TERM echo "==> Installing lxcfs to temporary path" cd {{LXCFS_BUILD_ROOT}} DESTDIR=${installdir} meson install if [ -n "${LD_LIBRARY_PATH:-}" ]; then export LD_LIBRARY_PATH="${libdir}:${LD_LIBRARY_PATH}" else export LD_LIBRARY_PATH=${libdir} fi echo "==> Spawning lxcfs" ${bindir}/lxcfs -p ${pidfile} ${testdir} & lxcfspid=$! count=1 while [ ! -d ${testdir}/proc ]; do [ $count -lt 5 ] sleep 1 count=$((count+1)) done rm -f /tmp/lxcfs-iwashere echo "==> Testing that lxcfs is functional" cat ${testdir}/proc/uptime [ ! 
-f /tmp/lxcfs-iwashere ] rm -f ${libdir}/liblxcfs.so* ${libdir}/liblxcfs.la cp {{LXCFS_BUILD_ROOT}}/liblxcfstest.so ${libdir}/liblxcfs.so kill -USR1 ${lxcfspid} sleep 1 cat ${testdir}/proc/uptime sleep 1 [ -f /tmp/lxcfs-iwashere ] FAILED=0 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/tests/test_sigusr2.sh.in0000775000175000017500000000254414773561567017703 0ustar00stgraberstgraber#!/bin/sh # SPDX-License-Identifier: LGPL-2.1+ set -eu [ -n "${DEBUG:-}" ] && set -x PASS=0 cleanup() { [ "$PASS" = "1" ] || (echo FAIL && exit 1) } trap cleanup EXIT HUP INT TERM LXCFSDIR=${LXCFSDIR:-/var/lib/lxcfs} if ! mountpoint -q ${LXCFSDIR}; then echo "lxcfs isn't mounted on ${LXCFSDIR}" exit 1 fi IS_CGROUP_V2=0 grep -qF 'cgroup cgroup' /proc/1/mountinfo || IS_CGROUP_V2=1 if [ "$IS_CGROUP_V2" = "1" ]; then echo "==> Setting up cgroup in lxcfs_test_proc" [ ! -d /sys/fs/cgroup ] && exit 0 mempath=/sys/fs/cgroup rmdir ${mempath}/lxcfs_test_proc 2>/dev/null || true mkdir ${mempath}/lxcfs_test_proc memory_limit_file=memory.max tasks_file=cgroup.procs else echo "==> Setting up memory cgroup in lxcfs_test_proc" [ ! 
-d /sys/fs/cgroup/memory ] && exit 0 initmemory=$(awk -F: '/memory/ { print $3 }' /proc/1/cgroup) mempath=/sys/fs/cgroup/memory/${initmemory} rmdir ${mempath}/lxcfs_test_proc 2>/dev/null || true mkdir ${mempath}/lxcfs_test_proc memory_limit_file=memory.limit_in_bytes tasks_file=tasks fi echo 1 > ${mempath}/lxcfs_test_proc/${tasks_file} echo $((64*1024*1024)) > ${mempath}/lxcfs_test_proc/${memory_limit_file} # Test meminfo echo "==> Testing /proc/meminfo" [ "$(grep "^MemTotal:.*kB$" ${LXCFSDIR}/proc/meminfo)" = "$(grep "^MemTotal:.*kB$" /proc/meminfo)" ] PASS=1 ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/tests/test_syscalls.c0000664000175000017500000002420614773561567017341 0ustar00stgraberstgraber/* SPDX-License-Identifier: LGPL-2.1+ */ #include "config.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void test_open(const char *path) { int fd = open(path, O_RDONLY); if (fd >= 0) { fprintf(stderr, "leak at open of %s\n", path); exit(1); } if (errno != ENOENT) { fprintf(stderr, "leak at open of %s: errno was %d\n", path, errno); exit(1); } } void test_stat(const char *path) { struct stat sb; if (stat(path, &sb) >= 0) { fprintf(stderr, "leak at stat of %s\n", path); exit(1); } if (errno != ENOENT) { fprintf(stderr, "leak at stat of %s: errno was %d\n", path, errno); exit(1); } } void test_access(const char *path) { if (access(path, O_RDONLY) >= 0) { fprintf(stderr, "leak at access of %s\n", path); exit(1); } if (errno != ENOENT) { fprintf(stderr, "leak at access of %s: errno was %d\n", path, errno); exit(1); } } void test_bind(const char *path) { int sfd; struct sockaddr_un my_addr; sfd = socket(AF_UNIX, SOCK_STREAM, 0); if (sfd < 0) { fprintf(stderr, "Failed to open a socket for bind test\n"); exit(1); } memset(&my_addr, 0, sizeof(struct sockaddr_un)); my_addr.sun_family = AF_UNIX; 
strncpy(my_addr.sun_path, path, sizeof(my_addr.sun_path) - 1); if (bind(sfd, (struct sockaddr *) &my_addr, sizeof(struct sockaddr_un)) != -1) { fprintf(stderr, "leak at bind of %s\n", path); exit(1); } if (errno != ENOENT && errno != ENOSYS) { fprintf(stderr, "leak at bind of %s: errno was %s\n", path, strerror(errno)); exit(1); } close(sfd); } void test_bindmount(const char *path) { if (mount(path, path, "none", MS_BIND, NULL) == 0) { fprintf(stderr, "leak at bind mount of %s\n", path); exit(1); } } void test_truncate(const char *path) { if (truncate(path, 0) == 0) { fprintf(stderr, "leak at truncate of %s\n", path); exit(1); } } void test_chdir(const char *path) { if (chdir(path) == 0) { fprintf(stderr, "leak at chdir to %s\n", path); exit(1); } } void test_rename(const char *path) { char *d = strdupa(path), *tmpname; d = dirname(d); size_t len = strlen(path) + 30; tmpname = alloca(len); snprintf(tmpname, len, "%s/%d", d, (int)getpid()); if (rename(path, tmpname) == 0 || errno != ENOENT) { fprintf(stderr, "leak at rename of %s\n", path); exit(1); } } void test_mkdir(const char *path) { size_t len = strlen(path) + 30; char *tmpname = alloca(len); snprintf(tmpname, len, "%s/%d", path, (int)getpid()); if (mkdir(path, 0755) == 0) { fprintf(stderr, "leak at mkdir of %s\n", path); exit(1); } if (errno != ENOENT) { fprintf(stderr, "leak at mkdir of %s, errno was %s\n", path, strerror(errno)); exit(1); } if (mkdir(tmpname, 0755) == 0) { fprintf(stderr, "leak at mkdir of %s\n", tmpname); exit(1); } if (errno != ENOENT) { fprintf(stderr, "leak at mkdir of %s, errno was %s\n", path, strerror(errno)); exit(1); } } void test_rmdir(const char *path) { size_t len = strlen(path) + 30; char *tmpname = alloca(len); snprintf(tmpname, len, "%s/%d", path, (int)getpid()); if (rmdir(path) == 0 || errno != ENOENT) { fprintf(stderr, "leak at rmdir of %s\n", path); exit(1); } if (rmdir(tmpname) == 0 || errno != ENOENT) { fprintf(stderr, "leak at rmdir of %s\n", tmpname); exit(1); } } void 
test_creat(const char *path) { if (creat(path, 0755) >= 0) { fprintf(stderr, "leak at creat of %s\n", path); exit(1); } if (errno != ENOENT && errno != ENOSYS) { fprintf(stderr, "leak at creat of %s: errno was %s\n", path, strerror(errno)); exit(1); } } void test_link(const char *path) { char *d = strdupa(path), *tmpname; d = dirname(d); size_t len = strlen(path) + 30; tmpname = alloca(len); snprintf(tmpname, len, "%s/%d", d, (int)getpid()); if (link(path, tmpname) == 0) { fprintf(stderr, "leak at link of %s\n", path); exit(1); } if (errno != ENOENT && errno != ENOSYS) { fprintf(stderr, "leak at link of %s: errno was %s\n", path, strerror(errno)); exit(1); } if (link(tmpname, path) == 0) { fprintf(stderr, "leak at link (2) of %s\n", path); exit(1); } if (errno != ENOENT && errno != ENOSYS) { fprintf(stderr, "leak at link (2) of %s: errno was %s\n", path, strerror(errno)); exit(1); } } void test_unlink(const char *path) { if (unlink(path) == 0) { fprintf(stderr, "leak at unlink of %s\n", path); exit(1); } if (errno != ENOENT && errno != ENOSYS) { fprintf(stderr, "leak at unlink of %s: errno was %s\n", path, strerror(errno)); exit(1); } } void test_symlink(const char *path) { char *d = strdupa(path), *tmpname; d = dirname(d); size_t len = strlen(path) + 30; tmpname = alloca(len); snprintf(tmpname, len, "%s/%d", d, (int)getpid()); if (symlink(tmpname, path) == 0) { fprintf(stderr, "leak at symlink of %s\n", path); exit(1); } if (errno != ENOENT && errno != ENOSYS) { fprintf(stderr, "leak at symlink of %s: errno was %s\n", path, strerror(errno)); exit(1); } if (symlink(path, tmpname) == 0) { fprintf(stderr, "leak at symlink (2) of %s\n", path); exit(1); } if (errno != ENOENT && errno != ENOSYS) { fprintf(stderr, "leak at symlink (2) of %s: errno was %s\n", path, strerror(errno)); exit(1); } } void test_readlink(const char *path) { char *dest = alloca(2 * strlen(path)); if (readlink(path, dest, 2 * strlen(path)) >= 0) { fprintf(stderr, "leak at readlink of %s\n", path); 
exit(1); } if (errno != ENOENT && errno != ENOSYS) { fprintf(stderr, "leak at readlink of %s: errno was %s\n", path, strerror(errno)); exit(1); } } void test_chmod(const char *path) { if (chmod(path, 0755) == 0) { fprintf(stderr, "leak at chmod of %s\n", path); exit(1); } if (errno != ENOENT && errno != ENOSYS) { fprintf(stderr, "leak at chmod of %s: errno was %s\n", path, strerror(errno)); exit(1); } } void test_chown(const char *path) { if (chown(path, 0, 0) == 0) { fprintf(stderr, "leak at chown of %s\n", path); exit(1); } if (errno != ENOENT && errno != ENOSYS) { fprintf(stderr, "leak at chown of %s: errno was %s\n", path, strerror(errno)); exit(1); } } void test_lchown(const char *path) { if (lchown(path, 0, 0) == 0) { fprintf(stderr, "leak at lchown of %s\n", path); exit(1); } if (errno != ENOENT && errno != ENOSYS) { fprintf(stderr, "leak at lchown of %s: errno was %s\n", path, strerror(errno)); exit(1); } } void test_mknod(const char *path) { if (mknod(path, 0755, makedev(0, 0)) == 0) { fprintf(stderr, "leak at mknod of %s\n", path); exit(1); } if (errno != ENOENT && errno != ENOSYS) { fprintf(stderr, "leak at mknod of %s: errno was %s\n", path, strerror(errno)); exit(1); } } void test_chroot(const char *path) { if (chroot(path) == 0) { fprintf(stderr, "leak at chroot of %s\n", path); exit(1); } if (errno != ENOENT && errno != ENOSYS) { fprintf(stderr, "leak at chroot of %s: errno was %s\n", path, strerror(errno)); exit(1); } } void test_xattrs(const char *path) { /* * might consider doing all of: * setxattr * lsetxattr * getxattr * lgetxattr * listxattr * llistxattr * removexattr * lremovexattr */ char value[200]; if (getxattr(path, "security.selinux", value, 200) >= 0) { fprintf(stderr, "leak at getxattr of %s\n", path); exit(1); } if (errno != ENOENT && errno != ENOSYS) { fprintf(stderr, "leak at getxattr of %s: errno was %s\n", path, strerror(errno)); exit(1); } } void test_utimes(const char *path) { struct utimbuf times; times.actime = 0; times.modtime 
= 0; if (utime(path, ×) == 0) { fprintf(stderr, "leak at utime of %s\n", path); exit(1); } if (errno != ENOENT && errno != ENOSYS) { fprintf(stderr, "leak at utime of %s: errno was %s\n", path, strerror(errno)); exit(1); } } void test_openat(const char *path) { char *d = strdupa(path), *f, *tmpname; int fd, fd2; f = basename(d); d = dirname(d); fd = open(d, O_RDONLY); if (fd < 0) { fprintf(stderr, "Error in openat test: could not open parent dir\n"); fprintf(stderr, "(this is expected on the second run)\n"); return; } fd2 = openat(fd, f, O_RDONLY); if (fd2 >= 0 || errno != ENOENT) { fprintf(stderr, "leak at openat of %s\n", f); exit(1); } size_t len = strlen(path) + strlen("/cgroup.procs") + 1; tmpname = alloca(len); snprintf(tmpname, len, "%s/cgroup.procs", f); fd2 = openat(fd, tmpname, O_RDONLY); if (fd2 >= 0 || errno != ENOENT) { fprintf(stderr, "leak at openat of %s\n", tmpname); exit(1); } close(fd); } int main(int argc, char *argv[]) { char *procspath; size_t len; if (geteuid() != 0) { fprintf(stderr, "Run me as root\n"); exit(1); } if (argc != 2) { fprintf(stderr, "Usage: %s [lxcfs_test_cgroup_path]\n", argv[0]); exit(1); } /* Try syscalls on the directory and on $directory/cgroup.procs */ len = strlen(argv[1]) + strlen("/cgroup.procs") + 1; procspath = alloca(len); snprintf(procspath, len, "%s/cgroup.procs", argv[1]); test_open(argv[1]); test_open(procspath); test_stat(argv[1]); test_stat(procspath); test_access(argv[1]); test_access(procspath); test_bind(argv[1]); test_bind(procspath); test_bindmount(argv[1]); test_bindmount(procspath); test_truncate(argv[1]); test_truncate(procspath); test_chdir(argv[1]); test_chdir(procspath); test_rename(argv[1]); test_rename(procspath); test_mkdir(argv[1]); test_mkdir(procspath); test_rmdir(argv[1]); test_rmdir(procspath); test_creat(argv[1]); test_creat(procspath); test_link(argv[1]); test_link(procspath); test_unlink(argv[1]); test_unlink(procspath); test_symlink(argv[1]); test_symlink(procspath); 
test_readlink(argv[1]); test_readlink(procspath); test_chmod(argv[1]); test_chmod(procspath); test_chown(argv[1]); test_chown(procspath); test_lchown(argv[1]); test_lchown(procspath); test_mknod(argv[1]); test_mknod(procspath); test_chroot(argv[1]); test_chroot(procspath); test_xattrs(argv[1]); test_xattrs(procspath); test_utimes(argv[1]); test_utimes(procspath); test_openat(argv[1]); // meh... linkat etc? printf("All tests passed\n"); return 0; } ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/tools/0000775000175000017500000000000014773561567014273 5ustar00stgraberstgraber././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/tools/meson-build.sh0000775000175000017500000000063214773561567017051 0ustar00stgraberstgraber#!/bin/sh # SPDX-License-Identifier: LGPL-2.1-or-later set -eux src="$1" dst="$2" target="$3" options="$4" CC="$5" CXX="$6" [ -f "$dst/ninja.build" ] || CC="$CC" CXX="$CXX" meson "$src" "$dst" $options # Locate ninja binary, on CentOS 7 it is called ninja-build, so # use that name if available. ninja=ninja if which ninja-build >/dev/null 2>&1 ; then ninja=ninja-build fi "$ninja" -C "$dst" "$target" ././@PaxHeader0000000000000000000000000000002600000000000010213 xustar0022 mtime=1743709047.0 lxcfs-6.0.4/tools/meson-render-jinja2.py0000775000175000017500000000172414773561567020425 0ustar00stgraberstgraber#!/usr/bin/env python3 # SPDX-License-Identifier: LGPL-2.1-or-later import ast import os import re import sys import jinja2 def parse_config_h(filename): # Parse config.h file generated by meson. 
ans = {} for line in open(filename): m = re.match(r'#define\s+(\w+)\s+(.*)', line) if not m: continue a, b = m.groups() if b and b[0] in '0123456789"': b = ast.literal_eval(b) ans[a] = b return ans def render(filename, defines): text = open(filename).read() template = jinja2.Template( text, trim_blocks=True, keep_trailing_newline=True, undefined=jinja2.StrictUndefined, ) return template.render(defines) if __name__ == '__main__': defines = parse_config_h(sys.argv[1]) output = render(sys.argv[2], defines) with open(sys.argv[3], 'w') as f: f.write(output) info = os.stat(sys.argv[2]) os.chmod(sys.argv[3], info.st_mode)