0
votes

We are trying to learn how to build a container-like application on Linux (I'm using Ubuntu 20.04).

We've fetched an Ubuntu file system with the following command:

sudo debootstrap --variant=minbase bionic /home/some/path/ubuntu_fs http://ftp.heanet.ie/pub/ubuntu/

Here's what we are doing at the moment:

#include <bits/stdc++.h>
#include <unistd.h>
#include <sched.h>
#include <sys/wait.h>
#include <sys/mount.h>

using namespace std;


#define STACK_SIZE (1024 * 1024)
#define CHROOT_PATH "/home/some/path/ubuntu_fs"

// Argument bundle passed to the cloned child through clone()'s void* parameter.
struct ChildArgs
{
    // Command to execute followed by its arguments (main passes &argv[1]).
    char **argv;
};



// Entry point of the cloned child: sets the hostname, chroots into CHROOT_PATH,
// performs two mount() calls, then forks and execs the requested command.
// `args` is cast to a ChildArgs*; its argv holds the command and its arguments.
// Returns errno when a mount() call fails, EXIT_FAILURE when fork() fails,
// and 0 otherwise.
int child(void* args)
{
    // NOTE(review): the results of sethostname(), chroot() and chdir() are not checked.
    sethostname("conman", 6);   
    chroot(CHROOT_PATH);
    chdir("/");

    ChildArgs* child_args = static_cast<ChildArgs*>(args);

    // Removing the `if` block immediately below makes the program run fine
    // NOTE(review): because of the chroot() above, "/" here names CHROOT_PATH.
    // Presumably that directory is not a mount point, which would explain the
    // EINVAL reported for this propagation change -- confirm against mount(2).
    if(mount("", "/", "", MS_PRIVATE | MS_REC, NULL))
    {
        // perror() appends ": <error text>" after the message, so the "\n"
        // inside the literal is printed before the colon.
        perror("Mount failed\n");
        return errno;
    }

    if(mount("proc", "/proc", "proc", 0, NULL))
    {
        perror("Mount failed\n");
        return errno;
    }

    pid_t fork_pid = fork();
    if(fork_pid < 0)
        return EXIT_FAILURE;

    else if(fork_pid == 0)
    {

        // NOTE(review): if execvp() returns (i.e. it failed), this forked process
        // leaves the branch and reaches `return 0` below without reporting anything.
        execvp(child_args->argv[0], child_args->argv);
    }

    else
    {
        // Wait for the command to finish, then undo the mounts (results unchecked).
        waitpid(fork_pid, NULL, 0);
        umount("/proc");
        umount("/");
    }
    return 0;
}

// Program entry point: runs `child` in new UTS, mount and PID namespaces and
// waits for it to terminate.
// Usage: <program> <command> [args...]
// Returns 0 once the child has been reaped; exits with EXIT_FAILURE when no
// command is given, when clone() fails or when waitpid() fails.
int main(int argc, char *argv[])
{
    // Without a command, argv[1] is the NULL terminator and the child would
    // end up calling execvp() with a null file name.
    if(argc < 2)
    {
        fprintf(stderr, "Usage: %s <command> [args...]\n", argc > 0 ? argv[0] : "conman");
        exit(EXIT_FAILURE);
    }

    static char child_stack[STACK_SIZE];

    int flags = CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWPID | SIGCHLD;

    ChildArgs child_args {&argv[1]};

    // clone() is handed the end of the buffer as the child's stack pointer.
    pid_t childpid = clone(child, child_stack + STACK_SIZE, flags, &child_args);
    if(childpid == -1)
    {
        // Report the real cause here instead of letting waitpid(-1, ...) fail later.
        perror("clone");
        exit(EXIT_FAILURE);
    }

    if(waitpid(childpid, NULL, 0) == -1)
    {
        perror("Error");
        exit(EXIT_FAILURE);
    }

    return 0;
}

The reason we want to mount the chroot directory as private is that we do not want mounts in the child mount namespace to propagate to any other namespace. If we do not do this, any mounts in the new namespace inherit the MS_SHARED mount type from the parent mount, and are hence visible in the parent namespace. This SO answer also recommends doing the same.

However, executing our code returns the following error:

Mount failed: Invalid argument

Changing the MS_PRIVATE | MS_REC to 0 returns the following:

Mount failed: No such device

Mounting only /proc and /sys in the chroot directory works fine, but those mounts are visible from the parent namespace (as expected).

Any suggestions on how to make the mounts private would be very helpful.

1
Have you tried NULL (or 0) as third parameter of mount() instead of the empty string "" ? - Rachid K.
It would be helpful if you presented a minimal reproducible example. That shouldn't be too hard if the code fragments presented already capture all the important details. - John Bollinger
@RachidK. Yep, we've tried both NULL and 0, but it doesn't seem to make any difference. - Akash Vaish
@JohnBollinger Provided a reproducible piece of code now, thanks. - Akash Vaish
Side note: You're passing child_stack + STACK_SIZE to clone as the child stack pointer. This arg should point to the last byte of the available stack. But, you're pointing to one byte beyond that (i.e. UB [undefined behavior]). I think you want child_stack + STACK_SIZE - 1. From man clone: so child_stack usually points to the topmost address of the memory space set up for the child stack. If it were me, I'd use malloc instead of "putting a stack within a stack"--YMMV - Craig Estey

1 Answer

-1
votes

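I guess that the directory CHROOT_PATH for which you call mount() is not a mount point: after chroot(CHROOT_PATH), the path "/" that you pass to mount() refers to that directory, and changing the propagation type of something that is not a mount point fails with EINVAL ("Invalid argument"). Hence the error.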

I reworked your program in C to do almost the same thing. The difference is that the mount(MS_PRIVATE | MS_REC) operation is done on the '/' mount point in the new namespaces before chroot(), not after. The program builds a reduced file system on the fly with some bind mounts to get access to the host executables/libraries.

#define _GNU_SOURCE
#include <unistd.h>
#include <sched.h>
#include <sys/wait.h>
#include <sys/mount.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sched.h>
#include <sys/stat.h>

#define STACK_SIZE (1024 * 1024)
#define CHROOT_PATH "/tmp/try"

#define ERR(msg, ...) fprintf(stderr, "Line#%d - " msg, __LINE__, ##__VA_ARGS__)

/*
 * Host directories exposed inside CHROOT_PATH.
 *   source - path on the host
 *   target - path of the matching directory under CHROOT_PATH
 *   mount  - non-zero: bind-mount source onto target;
 *            zero: the target directory is only created
 * The table is terminated by an entry whose source is a null pointer.
 */
struct bind_mount_points_t {
  const char *source;
  const char *target;
  int mount;
} bind_mount_points[] = {
  { .source = "/bin",   .target = CHROOT_PATH "/bin",   .mount = 1 },
  { .source = "/lib",   .target = CHROOT_PATH "/lib",   .mount = 1 },
  { .source = "/lib64", .target = CHROOT_PATH "/lib64", .mount = 1 },
  { .source = "/usr",   .target = CHROOT_PATH "/usr",   .mount = 1 },
  { .source = "/etc",   .target = CHROOT_PATH "/etc",   .mount = 1 },
  { .source = "/tmp",   .target = CHROOT_PATH "/tmp",   .mount = 0 },

  /* End-of-table marker */
  { .source = NULL, .target = NULL }
};


/*
 * Entry point of the cloned child.
 *
 * Sets the hostname, makes every mount private, builds a small root file
 * system under CHROOT_PATH from the bind_mount_points table, chroots into it,
 * mounts /proc, runs the requested program and finally tears everything down.
 *
 * args: the NULL-terminated argument vector of the program to run
 *       (args[0] is the path handed to execv()).
 *
 * Returns 0 on success, the errno value of the failing call otherwise
 * (EXIT_FAILURE when fork() fails).
 */
int child(void *args)
{
  char **av = (char **)args;
  int rc;
  int err;
  int i;

  rc = sethostname("conman", 6);
  if (rc != 0) {
    // Save errno first: fprintf() inside ERR() is allowed to modify it
    err = errno;
    ERR("sethostname(): '%m' (%d)\n", err);
    return err;
  }

  // Make all mounts private so that nothing done below propagates to the
  // parent namespace. This is done on "/" before chroot().
  if (mount("none", "/", NULL, MS_PRIVATE | MS_REC, NULL) != 0) {
    err = errno;
    ERR("mount(): '%m' (%d)\n", err);
    return err;
  }

  // Make the file system with some bind mounts to get access to executables
  for (i = 0; bind_mount_points[i].source != NULL; i++) {

    // mkdir() fails (e.g. EEXIST) if the directory is already there
    rc = mkdir(bind_mount_points[i].target, 0777);
    if (rc != 0) {
      err = errno;
      ERR("mkdir(%s): '%m' (%d)\n", bind_mount_points[i].target, err);
      return err;
    }

    if (bind_mount_points[i].mount) {
      rc = mount(bind_mount_points[i].source, bind_mount_points[i].target,
                 NULL, MS_BIND | MS_REC, NULL);
      if (rc != 0) {
        err = errno;
        ERR("mount(%s): '%m' (%d)\n", bind_mount_points[i].target, err);
        return err;
      }
    }
  }

  // Change root
  rc = chroot(CHROOT_PATH);
  if (rc != 0) {
    err = errno;
    ERR("chroot(): '%m' (%d)\n", err);
    return err;
  }

  rc = chdir("/");
  if (rc != 0) {
    err = errno;
    ERR("chdir(/): '%m' (%d)\n", err);
    return err;
  }

  rc = mkdir("/proc", 0777);
  if (rc != 0) {
    err = errno;
    ERR("mkdir(/proc): '%m' (%d)\n", err);
    return err;
  }

  if (mount("proc", "/proc", "proc", 0, NULL) != 0) {
    err = errno;
    ERR("mount(proc): '%m' (%d)\n", err);
    return err;
  }

  pid_t fork_pid = fork();
  if (fork_pid < 0) {
    ERR("fork(): '%m' (%d)\n", errno);
    return EXIT_FAILURE;
  }

  if (fork_pid == 0) {
    execv(av[0], av);
    // Only reached when execv() failed
    ERR("execv(%s): '%m' (%d)\n", av[0] ? av[0] : "(null)", errno);
    _exit(2);
  }

  // Parent side: wait for the program, then clean up (best effort for /proc)
  if (waitpid(fork_pid, NULL, 0) == -1) {
    ERR("waitpid(): '%m' (%d)\n", errno);
  }

  // Cleanup
  if (umount("/proc") != 0) {
    ERR("umount(/proc): '%m' (%d)\n", errno);
  }
  if (rmdir("/proc") != 0) {
    ERR("rmdir(/proc): '%m' (%d)\n", errno);
  }

  // We are chrooted in CHROOT_PATH now, so each .source path ("/bin", ...)
  // names the directory that was created as .target before the chroot().
  for (i = 0; bind_mount_points[i].source != NULL; i++) {

    if (bind_mount_points[i].mount) {
      if (umount(bind_mount_points[i].source) != 0) {
        err = errno;
        ERR("umount(%s): '%m' (%d)\n", bind_mount_points[i].source, err);
        return err;
      }
    }

    rc = rmdir(bind_mount_points[i].source);
    if (rc != 0) {
      err = errno;
      ERR("rmdir(%s): '%m' (%d)\n", bind_mount_points[i].source, err);
      return err;
    }
  }

  return 0;
}

/*
 * Program entry point: runs child() in new UTS, mount and PID namespaces and
 * waits for it to terminate.
 *
 * Usage: ctr <path-to-program> [args...]
 *
 * Returns 0 once the child has been reaped; exits with EXIT_FAILURE when no
 * program is given, when clone() fails or when waitpid() fails.
 */
int main(int argc, char *argv[])
{
  static char child_stack[STACK_SIZE];

  int flags = CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWPID | SIGCHLD;

  // Without a program, argv[1] is the NULL terminator and the child would
  // set up the whole file system only to call execv() with a null path.
  if (argc < 2) {
    fprintf(stderr, "Usage: %s <path-to-program> [args...]\n",
            argc > 0 ? argv[0] : "ctr");
    exit(EXIT_FAILURE);
  }

  // clone() is handed the end of the buffer as the child's stack pointer
  pid_t childpid = clone(child, child_stack + STACK_SIZE, flags, &(argv[1]));

  if (childpid < 0) {
    ERR("clone(): '%m' (%d)\n", errno);
    exit(EXIT_FAILURE);
  }

  if (waitpid(childpid, NULL, 0) == -1) {
    ERR("waitpid(): '%m' (%d)\n", errno);
    exit(EXIT_FAILURE);
  }

  return 0;
}

Build of the program:

$ gcc ctr.c -o ctr

Make the directory on which the chroot() occurs:

$ mkdir /tmp/try

Run the program with a shell:

$ sudo ./ctr /bin/bash
root@conman:/# hostname
conman
root@conman:/# pwd
/
root@conman:/# ps -ef
UID          PID    PPID  C STIME TTY          TIME CMD
root           1       0  0 07:20 ?        00:00:00 ./ctr /bin/bash
root           2       1  0 07:20 ?        00:00:00 /bin/bash
root           4       2  0 07:20 ?        00:00:00 ps -ef
root@conman:/# ls -l /
total 108
drwxr-xr-x   2 root root 69632 mars   25 06:09 bin
drwxr-xr-x 152 root root 12288 mars   24 08:07 etc
drwxr-xr-x 152 root root 12288 mars   25 06:09 lib
drwxr-xr-x   2 root root  4096 févr.  4 10:02 lib64
dr-xr-xr-x 393 root root     0 mars   25 07:20 proc
drwxr-xr-x   2 root root  4096 mars   25 07:20 tmp
drwxr-xr-x  16 root root  4096 nov.   22 20:02 usr
root@conman:/# exit
exit

I modified the previous program to use the Ubuntu file system installed through debootstrap:

#define _GNU_SOURCE
#include <unistd.h>
#include <sched.h>
#include <sys/wait.h>
#include <sys/mount.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sched.h>
#include <sys/stat.h>

#define STACK_SIZE (1024 * 1024)
#define CHROOT_PATH "/tmp/try"

#define ERR(msg, ...) fprintf(stderr, "Line#%d - " msg, __LINE__, ##__VA_ARGS__)

/*
 * Entry point of the cloned child.
 *
 * Sets the hostname, makes every mount private, chroots into CHROOT_PATH,
 * mounts /proc there, runs the requested program and unmounts /proc again.
 *
 * args: the NULL-terminated argument vector of the program to run
 *       (args[0] is the path handed to execv()).
 *
 * Returns 0 on success, the errno value of the failing call otherwise
 * (EXIT_FAILURE when fork() fails).
 */
int child(void *args)
{
  char **av = (char **)args;
  int rc;
  int err;

  rc = sethostname("conman", 6);
  if (rc != 0) {
    // Save errno first: fprintf() inside ERR() is allowed to modify it
    err = errno;
    ERR("sethostname(): '%m' (%d)\n", err);
    return err;
  }

  // Make all mounts private so that nothing done below propagates to the
  // parent namespace. This is done on "/" before chroot().
  if (mount("none", "/", NULL, MS_PRIVATE | MS_REC, NULL) != 0) {
    err = errno;
    ERR("mount(): '%m' (%d)\n", err);
    return err;
  }

  // Change root
  rc = chroot(CHROOT_PATH);
  if (rc != 0) {
    err = errno;
    ERR("chroot(): '%m' (%d)\n", err);
    return err;
  }

  rc = chdir("/");
  if (rc != 0) {
    err = errno;
    ERR("chdir(/): '%m' (%d)\n", err);
    return err;
  }

  if (mount("proc", "/proc", "proc", 0, NULL) != 0) {
    err = errno;
    ERR("mount(proc): '%m' (%d)\n", err);
    return err;
  }

  pid_t fork_pid = fork();
  if (fork_pid < 0) {
    ERR("fork(): '%m' (%d)\n", errno);
    return EXIT_FAILURE;
  }

  if (fork_pid == 0) {
    execv(av[0], av);
    // Only reached when execv() failed
    ERR("execv(%s): '%m' (%d)\n", av[0] ? av[0] : "(null)", errno);
    _exit(2);
  }

  // Parent side: wait for the program, then clean up (best effort)
  if (waitpid(fork_pid, NULL, 0) == -1) {
    ERR("waitpid(): '%m' (%d)\n", errno);
  }

  // Cleanup
  if (umount("/proc") != 0) {
    ERR("umount(/proc): '%m' (%d)\n", errno);
  }

  return 0;
}

/*
 * Program entry point: runs child() in new UTS, mount and PID namespaces and
 * waits for it to terminate.
 *
 * Usage: ctr2 <path-to-program> [args...]
 *
 * Returns 0 once the child has been reaped; exits with EXIT_FAILURE when no
 * program is given, when clone() fails or when waitpid() fails.
 */
int main(int argc, char *argv[])
{
  static char child_stack[STACK_SIZE];

  int flags = CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWPID | SIGCHLD;

  // Without a program, argv[1] is the NULL terminator and the child would
  // chroot and mount /proc only to call execv() with a null path.
  if (argc < 2) {
    fprintf(stderr, "Usage: %s <path-to-program> [args...]\n",
            argc > 0 ? argv[0] : "ctr2");
    exit(EXIT_FAILURE);
  }

  // clone() is handed the end of the buffer as the child's stack pointer
  pid_t childpid = clone(child, child_stack + STACK_SIZE, flags, &(argv[1]));

  if (childpid < 0) {
    ERR("clone(): '%m' (%d)\n", errno);
    exit(EXIT_FAILURE);
  }

  if (waitpid(childpid, NULL, 0) == -1) {
    ERR("waitpid(): '%m' (%d)\n", errno);
    exit(EXIT_FAILURE);
  }

  return 0;
}

Installation of the file system (in /tmp/try), build of the program and execution:

$ sudo debootstrap --variant=minbase bionic /tmp/try http://ftp.heanet.ie/pub/ubuntu/
[...]
$ gcc ctr2.c -o ctr2
$ sudo ./ctr2 /bin/bash
root@conman:/# hostname       
conman
root@conman:/# pwd
/
root@conman:/# ls -la
total 80
drwxr-xr-x  21 root root 4096 Mar 25 06:40 .
drwxr-xr-x  21 root root 4096 Mar 25 06:40 ..
drwxr-xr-x   2 root root 4096 Mar 25 06:40 bin
drwxr-xr-x   2 root root 4096 Apr 24  2018 boot
drwxr-xr-x   4 root root 4096 Mar 25 06:40 dev
drwxr-xr-x  29 root root 4096 Mar 25 06:40 etc
drwxr-xr-x   2 root root 4096 Apr 24  2018 home
drwxr-xr-x   8 root root 4096 May 23  2017 lib
drwxr-xr-x   2 root root 4096 Mar 25 06:40 lib64
drwxr-xr-x   2 root root 4096 Mar 25 06:40 media
drwxr-xr-x   2 root root 4096 Mar 25 06:40 mnt
drwxr-xr-x   2 root root 4096 Mar 25 06:40 opt
dr-xr-xr-x 393 root root    0 Mar 25 06:46 proc
drwx------   2 root root 4096 Mar 25 06:40 root
drwxr-xr-x   4 root root 4096 Mar 25 06:40 run
drwxr-xr-x   2 root root 4096 Mar 25 06:40 sbin
drwxr-xr-x   2 root root 4096 Mar 25 06:40 srv
drwxr-xr-x   2 root root 4096 Apr 24  2018 sys
drwxrwxrwt   2 root root 4096 Mar 25 06:40 tmp
drwxr-xr-x  10 root root 4096 Mar 25 06:40 usr
drwxr-xr-x  11 root root 4096 Mar 25 06:40 var
root@conman:/# ps -ef
UID          PID    PPID  C STIME TTY          TIME CMD
root           1       0  0 06:46 ?        00:00:00 ./ctr2 /bin/bash
root           2       1  0 06:46 ?        00:00:00 /bin/bash
root           9       2  0 06:46 ?        00:00:00 ps -ef
root@conman:/# 

A USB key mounted in this "guest" file system is not visible on the host side. For example, on my system, the USB key shows up as /dev/sdd1 (if the host auto-mounts it, unmount it first):

$ lsblk
[...]
sdd      8:48   1   7,5G  0 disk 
`-sdd1   8:49   1   7,5G  0 part 

I create those nodes in the "guest" file system and mount the key:

root@conman:/# df        
df: no file systems processed
root@conman:/# mknod /dev/sdd b 8 48
root@conman:/# mknod /dev/sdd1 b 8 49
root@conman:/# mount -t ntfs /dev/sdd1 /media
root@conman:/# df
Filesystem     1K-blocks   Used Available Use% Mounted on
/dev/sdd1        7830524 357764   7472760   5% /media

On host side, the key (mount point) is not visible.