File Views

The Concept

File views are a high-level approach to ensuring that processes do not access unintended parts of a file. A file view consists of an elementary data type and a file type. The elementary type does not have to be a basic type like MPI_INT; it is the unit of data the file is made up of. The file type is a per-process description of which parts of the file that process can access. The restriction is set on the file handle with MPI_File_set_view(), based on the elementary and file types.

The Code

The program takes a file name as an input argument. A 512x512x512 array of type int is divided among the processes, which write it to the file using MPI-IO. The data is then read back in, and the write and read buffers are compared to verify that they are identical.

$ cd mpi_io/04_file_views/
$ cat src/darray.c
/******************************************************************************
 *                                                                            *
 *  MPI IO Example - File Views                                               *
 *                                                                            *
 *  File views are used to restrict where each process can access the         *
 *  input file.                                                               *
 *                                                                            *
 ******************************************************************************
 *                                                                            *
 *  The original code was written by Gustav at University of Indiana in 2003. *
 *                                                                            *
 *  The current version has been tested/updated by the HPC department at      *
 *  the Norwegian University of Science and Technology in 2011.               *
 *                                                                            *
 ******************************************************************************/
#include <stdio.h>      /* printf and relatives live here */
#include <stdlib.h>     /* exit lives here */
#include <unistd.h>     /* getopt lives here */
#include <string.h>     /* strlen lives here */
#include <sys/types.h>  /* chmod needs these two */
#include <sys/stat.h>
#include <mpi.h>        /* all MPI stuff lives here (including MPI-IO) */
#include <errno.h>      /* errno lives here */
  
/* Rank of the process that parses the command line and prints the
   final verdict. */
#define MASTER_RANK 0
/* Home-grown boolean. BOOLEAN is a plain int, which is why the flags
   can be sent around with MPI_INT. */
#define TRUE 1
#define FALSE 0
#define BOOLEAN int
/* Number of bytes in a megabyte (not referenced by the code below). */
#define MBYTE 1048576
/* The global array has NDIMS dimensions of SIZE elements each. */
#define NDIMS  3
#define SIZE 512
/* Usage message. Expands to a printf call that reads argv[0], so it can
   only be used where argv is in scope. -f takes the file name, -v turns
   on verbose output. */
#define SYNOPSIS printf ("synopsis: %s -f <file> [-v]\n", argv[0])
 
int main(argc, argv)
     int argc;
     char *argv[];
{
  /* my variables */
 
  int my_rank, pool_size, i, ndims, order, file_name_length,
    array_of_gsizes[NDIMS], array_of_distribs[NDIMS],
    array_of_dargs[NDIMS], array_of_psizes[NDIMS],
    *write_buffer, write_buffer_size, count,
    *read_buffer, read_buffer_size;
  BOOLEAN i_am_the_master = FALSE, input_error = FALSE,
    file_open_error = FALSE, file_write_error = FALSE, verbose = FALSE,
    my_read_error = FALSE, read_error = FALSE;
  char *file_name = NULL, message[BUFSIZ];
 
  /* MPI variables */
 
  MPI_Offset file_size;
  MPI_File fh;
  MPI_Status status;
  MPI_Datatype file_type;
  MPI_Aint file_type_extent;
  int file_type_size;
  int error_string_length;
  char error_string[BUFSIZ];
 
  /* getopt variables */
 
  extern char *optarg;
  int c;
 
  /* error handling variables */
 
  extern int errno;
 
  /* ACTION */
 
  MPI_Init(&argc,&argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &pool_size);
  if (my_rank == MASTER_RANK) i_am_the_master = TRUE;
 
  if (i_am_the_master) {
    while ((c = getopt(argc, argv, "f:vh")) != EOF)
      switch(c) {
      case 'f':
        file_name = optarg;
        break;
      case 'v':
        verbose = TRUE;
        break;
      case 'h':
        input_error = TRUE;
        break;
      case '?':
        input_error = TRUE;
        break;
      }
    if (file_name == NULL) input_error = TRUE;
    if (input_error)
      SYNOPSIS;
    else {
      file_name_length = strlen(file_name) + 1;
      if (verbose) {
        printf("file_name         = %s\n", file_name);
        printf("file_name_length  = %d\n", file_name_length);
      }
    }
  }
 
  MPI_Bcast(&input_error, 1, MPI_INT, MASTER_RANK, MPI_COMM_WORLD);
 
  if (! input_error) {
    MPI_Bcast(&verbose, 1, MPI_INT, MASTER_RANK, MPI_COMM_WORLD);
    MPI_Bcast(&file_name_length, 1, MPI_INT, MASTER_RANK, MPI_COMM_WORLD);
    if (! i_am_the_master) file_name = (char*) malloc(file_name_length);
    MPI_Bcast(file_name, file_name_length, MPI_CHAR, MASTER_RANK,
              MPI_COMM_WORLD);
      
    /* Prepare for calling MPI_Type_create_darray */
 
    ndims = NDIMS;
    for (i = 0; i < ndims; i++) {
      array_of_gsizes[i] = SIZE;
      array_of_distribs[i] = MPI_DISTRIBUTE_BLOCK;
      array_of_dargs[i] = MPI_DISTRIBUTE_DFLT_DARG;
      array_of_psizes[i] = 0;
    }
    MPI_Dims_create(pool_size, ndims, array_of_psizes);
    order = MPI_ORDER_C;
 
    /* Now call MPI_Type_create_darray */
 
    if (verbose) {
      printf ("%3d: calling MPI_Type_create_darray with\n", my_rank);
      printf ("%3d:    pool_size         = %d\n", my_rank, pool_size);
      printf ("%3d:    my_rank           = %d\n", my_rank, my_rank);
      printf ("%3d:    ndims             = %d\n", my_rank, ndims);
      printf ("%3d:    array_of_gsizes   = (%d, %d, %d)\n", my_rank,
              array_of_gsizes[0],
              array_of_gsizes[1],
              array_of_gsizes[2]);
      printf ("%3d:    array_of_distribs = (%d, %d, %d)\n", my_rank,
              array_of_distribs[0],
              array_of_distribs[1],
              array_of_distribs[2]);
      printf ("%3d:    array_of_dargs    = (%d, %d, %d)\n", my_rank,
              array_of_dargs[0],
              array_of_dargs[1],
              array_of_dargs[2]);
      printf ("%3d:    array_of_psizes   = (%d, %d, %d)\n", my_rank,
              array_of_psizes[0],
              array_of_psizes[1],
              array_of_psizes[2]);
      printf ("%3d:    order             = %d\n", my_rank, order);
    }
    MPI_Type_create_darray(pool_size, my_rank, ndims,
                           array_of_gsizes, array_of_distribs,
                           array_of_dargs, array_of_psizes, order,
                           MPI_INT, &file_type);
    MPI_Type_commit(&file_type);
 
    /* Explore the returned type */
 
    MPI_Type_extent(file_type, &file_type_extent);
    MPI_Type_size(file_type, &file_type_size);
    if (verbose) {
      printf("%3d: file_type_size   = %d\n", my_rank, file_type_size);
      printf("%3d: file_type_extent = %d\n", my_rank, (int)file_type_extent);
    }
 
    /* Allocate space for your own write buffer based on the
       return of the MPI_Type_size call. */
 
    write_buffer_size = file_type_size / sizeof(int);
    write_buffer = (int*) malloc(write_buffer_size * sizeof(int));
 
    /* We do this in case sizeof(int) does not divide file_type_size
       exactly. But this should not happen if we have called
       MPI_Type_create_darray with MPI_INT as the original data
       type. */
 
    if (! write_buffer) {
      sprintf(message, "%3d: malloc write_buffer", my_rank);
      perror(message);
      MPI_Abort(MPI_COMM_WORLD, errno);
 
      /* We can still abort, because we have not opened any
         files yet. Notice that since MPI_Type_create_darray
         will fail if SIZE^3 * sizeof(int) exceeds MAX_INT,
         because MPI_Aint on AVIDD is a 32-bit integer,
         we are rather unlikely to fail on this malloc
         anyway. */
    }
 
    MPI_Barrier(MPI_COMM_WORLD);
    /* We wait here in case some procs have problems with malloc. */
 
    /* Initialize the buffer */
 
    for (i = 0; i < write_buffer_size; i++)
      *(write_buffer + i) = my_rank * SIZE + i;
 
    file_open_error = MPI_File_open(MPI_COMM_WORLD, file_name,
                                    MPI_MODE_CREATE | MPI_MODE_WRONLY,
                                    MPI_INFO_NULL, &fh);
    if (file_open_error != MPI_SUCCESS) {
      MPI_Error_string(file_open_error, error_string,
                       &error_string_length);
      fprintf(stderr, "%3d: %s\n", my_rank, error_string);
      MPI_Abort(MPI_COMM_WORLD, file_open_error);
 
      /* It is still OK to abort, because we have failed to
         open the file. */
 
    }
    else {
 
      if (i_am_the_master)
         chmod(file_name, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
      MPI_Barrier(MPI_COMM_WORLD);
 
      /* We should be able to change permissions on the file by setting
         the "file_perm" hint in the info structure before passing
         it to MPI_File_open, but MPICH2 doesn't support this
         yet. All processes have to meet on the barrier before
         further action. */
 
      MPI_File_set_view(fh, 0, MPI_INT, file_type, "native", MPI_INFO_NULL);
      file_write_error =
        MPI_File_write_all(fh, write_buffer, write_buffer_size, MPI_INT,
                           &status);
      if (file_write_error != MPI_SUCCESS) {
        MPI_Error_string(file_write_error, error_string,
                         &error_string_length);
        fprintf(stderr, "%3d: %s\n", my_rank, error_string);
        MPI_File_close(&fh);
        free(write_buffer);
        if (i_am_the_master) MPI_File_delete(file_name, MPI_INFO_NULL);
      }
      else {
        MPI_Get_count(&status, MPI_INT, &count);
        MPI_File_get_size(fh, &file_size);
        if(verbose) {
          printf("%3d: wrote %d integers\n", my_rank, count);
          printf("%3d: file size is %lld bytes\n", my_rank, file_size);
        }
        MPI_File_close(&fh);
 
        /* We have managed to open, write on it and close the file.
           Now we're going to read it the same way we wrote it. */
 
        read_buffer_size = write_buffer_size;
        read_buffer = (int*) malloc(read_buffer_size * sizeof(int));
        if (! read_buffer) {
          sprintf(message, "%3d: malloc read_buffer", my_rank);
          perror(message);
          MPI_Abort(MPI_COMM_WORLD, errno);
 
          /* We can abort, because the file has been closed and
             we haven't opened it for reading yet. */
 
        }
 
        MPI_Barrier(MPI_COMM_WORLD);
        /* We wait here in case some procs have problems with malloc. */
 
        MPI_File_open(MPI_COMM_WORLD, file_name, MPI_MODE_RDONLY,
                      MPI_INFO_NULL, &fh);
 
        /* We don't check for errors here, because we've just closed
           this file a moment ago, so it should still be there. */
 
        MPI_File_set_view(fh, 0, MPI_INT, file_type, "native", MPI_INFO_NULL);
        MPI_File_read_all(fh, read_buffer, read_buffer_size, MPI_INT, &status);
        MPI_Get_count(&status, MPI_INT, &count);
        if (verbose)
           printf("%3d: read %d integers\n", my_rank, count);
        MPI_File_close(&fh);
 
        /* Now check that the integers read are the same as the ones
           we wrote. */
 
        for (i = 0; i < read_buffer_size; i++) {
           if (*(write_buffer + i) != *(read_buffer + i)) {
              printf("%3d: data read different from data written, i = %d\n",
                     my_rank, i);
              my_read_error = TRUE;
           }
        }
         
        MPI_Reduce (&my_read_error, &read_error, 1, MPI_INT, MPI_LOR,
                    MASTER_RANK, MPI_COMM_WORLD);           
 
        if (i_am_the_master)
           if (! read_error)
              printf("--> All data read back is correct.\n");
 
      } /* no problem with file write */
    } /* no problem with file open */
  } /* no input error */
 
  MPI_Finalize();
  exit(0);
}
Previous Instructions

MPI_Offset file_size; Holds the size of the file in bytes. MPI_Offset is the integer type MPI uses for file offsets and file sizes.

MPI_File fh; MPI file handle.

MPI_Status status; Used to store the status of MPI_File_read_all() and MPI_File_write_all().

MPI_Datatype file_type; Custom MPI datatype.

MPI_Init() and MPI_Finalize(); Used to initialize and finalize the MPI program.

MPI_Comm_rank() and MPI_Comm_size(); Used to find the rank of a process and the total number of processes.

MPI_Bcast(); Used to broadcast information from the master process.

MPI_Abort(); Used to abort the MPI program.

MPI_Barrier(); Used to synchronize the processes.

MPI_Error_string(); Used to get the error string that corresponds to an MPI error code.

MPI_File_open(); Open MPI file.

MPI_File_close(&fh); Close MPI file.

MPI_Get_count(); Used to get the actual number of values written and read.

MPI_File_get_size(); Used to get the file size of an MPI file.

MPI_Reduce(); Used with MPI_LOR to check if any of the processes experienced read error.

MPI_Type_commit(); Used to commit a type before it can be used.

New Instructions

MPI_Dims_create( pool_size, ndims, array_of_psizes ); Divides pool_size processes among ndims dimensions and places the resulting number of processes in each dimension in array_of_psizes. Entries that are 0 on input are chosen by MPI.

MPI_Type_create_darray( pool_size, my_rank, ndims, array_of_gsizes, array_of_distribs, array_of_dargs, array_of_psizes, order, MPI_INT, &file_type ); Creates the data type file_type, which describes this process's part of an ndims-dimensional array of MPI_INTs distributed over an ndims-dimensional grid of processes. array_of_gsizes is the number of elements of the old type in each dimension of the global array. array_of_distribs is the type of distribution in each dimension, here MPI_DISTRIBUTE_BLOCK. array_of_dargs is the distribution argument in each dimension, here set to the default (MPI_DISTRIBUTE_DFLT_DARG). array_of_psizes is the number of processes in each dimension. order is set to MPI_ORDER_C, which is the row-major ordering used by C arrays.

MPI_Aint file_type_extent; Declaration of file_type_extent.

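MPI_Type_extent( file_type, &file_type_extent ); Gets the extent of file_type in bytes, which here is the size of the global array, i.e. of the whole file. Note that MPI_Type_extent was deprecated in MPI-2.0 and removed in MPI-3.0; MPI_Type_get_extent is its replacement.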

MPI_Type_size( file_type, &file_type_size ); Gets the size of file_type in bytes, i.e. the amount of data in this process's part of the file.

MPI_File_set_view( fh, 0, MPI_INT, file_type, "native", MPI_INFO_NULL ); Sets the file view of fh with a displacement of 0, MPI_INT as the elementary type and file_type as the file type. The data representation is "native", i.e. the data is laid out in the file exactly as it is in memory. MPI_INFO_NULL indicates that no hints are passed to the implementation.

MPI_File_write_all( fh, write_buffer, write_buffer_size, MPI_INT, &status ); A blocking collective write: it must be called by all the processes in the communicator the file was opened with.

Compile & Run

If you have not already done so, obtain all the example code here.

Switch to the Intel compiler (optional, only necessary once in each terminal session)

$ module load intel

Compile the program using

$ make

 Submit the job to the queue

$ make submit

The output files from the program execution are placed in the output folder

$ ls output/
131847.vilje.hpc.ntnu.no.ER  131847.vilje.hpc.ntnu.no.OU  file.bin

The standard out is placed in the .OU file

$ cat output/*OU
--> All data read back is correct.