Reading from MPI Files

The Concept

Multiple processes can read from the same file in parallel.

The Code

The program takes a single input argument – the name of the file to read, passed with the -f option. Each process reads its own part of the file into a local buffer. The program aborts on input errors or read errors.

$ cd mpi_io/02_reading_mpi_files/
$ cat src/xrandpfile.c
/******************************************************************************
 *                                                                            *
 *  MPI IO Example - Reading from MPI Files                                   *
 *                                                                            *
 *  Each of the processes reads a specified number of blocks from a           *
 *  single input file.                                                        *
 *                                                                            *
 *                                                                            *
 *  The original code was written by Gustav at University of Indiana in 2003. *
 *                                                                            *
 *  The current version has been tested/updated by the HPC department at      *
 *  the Norwegian University of Science and Technology in 2011.               *
 *                                                                            *
 ******************************************************************************/
#include <stdio.h>    /* all IO stuff lives here */
#include <stdlib.h>   /* exit lives here */
#include <unistd.h>   /* getopt lives here */
#include <string.h>   /* strcpy lives here */
#include <limits.h>   /* INT_MAX lives here */
#include <mpi.h>      /* MPI and MPI-IO live here */
/* Rank of the process that parses the command line and prints the results. */
#define MASTER_RANK 0
/* Home-grown boolean type and its two values. */
#define TRUE 1
#define FALSE 0
#define BOOLEAN int
/* Number of bytes in one megabyte; used to scale the reported transfer rate. */
#define MBYTE 1048576
/* Prints the usage line; expands to a printf that needs argv in scope. */
#define SYNOPSIS printf ("synopsis: %s -f <file>\n", argv[0])
int main(argc, argv)
     int argc;
     char *argv[];
  /* my variables */
  int my_rank, pool_size, last_guy, i, count;
  BOOLEAN i_am_the_master = FALSE, input_error = FALSE;
  char *filename = NULL, *read_buffer;
  int filename_length;
  int *junk;
  int file_open_error, number_of_bytes;
  /* MPI_Offset is long long */
  MPI_Offset my_offset, my_current_offset, total_number_of_bytes,
    number_of_bytes_ll, max_number_of_bytes_ll;
  MPI_File fh;
  MPI_Status status;
  double start, finish, io_time, longest_io_time;
  /* getopt variables */
  extern char *optarg;
  int c;
  /* ACTION */
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
  MPI_Comm_size(MPI_COMM_WORLD, &pool_size);
  last_guy = pool_size - 1;
  if (my_rank == MASTER_RANK) i_am_the_master = TRUE;
  if (i_am_the_master) {
    /* read the command line */
    while ((c = getopt(argc, argv, "f:h")) != EOF)
      switch(c) {
      case 'f':
    filename = optarg;
#ifdef DEBUG
    printf("input file: %s\n", filename);
      case 'h':
    input_error = TRUE;
      case '?':
    input_error = TRUE;
      } /* end of switch(c) */
    /* Check if the command line has initialized filename and
     * number_of_blocks.
    if (filename == NULL) {
      input_error = TRUE;
    if (input_error) MPI_Abort(MPI_COMM_WORLD, 1);
    filename_length = strlen(filename) + 1;
    /* This is another way of exiting, but it can be done only
       if no files have been opened yet. */
  } /* end of "if (i_am_the_master)"; reading the command line */
    /* If we got this far, the data read from the command line
       should be OK. */
  MPI_Bcast(&filename_length, 1, MPI_INT, MASTER_RANK, MPI_COMM_WORLD);
  if (! i_am_the_master) filename = (char*) malloc(filename_length);
#ifdef DEBUG
  printf("%3d: allocated space for filename\n", my_rank);
  MPI_Bcast(filename, filename_length, MPI_CHAR, MASTER_RANK, MPI_COMM_WORLD);
#ifdef DEBUG
  printf("%3d: received broadcast\n", my_rank);
  printf("%3d: filename = %s\n", my_rank, filename);
  /* Default I/O error handling is MPI_ERRORS_RETURN */
  file_open_error = MPI_File_open(MPI_COMM_WORLD, filename,
                          MPI_MODE_RDONLY, MPI_INFO_NULL, &fh);
  if (file_open_error != MPI_SUCCESS) {
    char error_string[BUFSIZ];
    int length_of_error_string, error_class;
    MPI_Error_class(file_open_error, &error_class);
    MPI_Error_string(error_class, error_string, &length_of_error_string);
    printf("%3d: %s\n", my_rank, error_string);
    MPI_Error_string(file_open_error, error_string, &length_of_error_string);
    printf("%3d: %s\n", my_rank, error_string);
    MPI_Abort(MPI_COMM_WORLD, file_open_error);
  MPI_File_get_size(fh, &total_number_of_bytes);
#ifdef DEBUG
  printf("%3d: total_number_of_bytes = %lld\n", my_rank, total_number_of_bytes);
  number_of_bytes_ll = total_number_of_bytes / pool_size;
  /* If pool_size does not divide total_number_of_bytes evenly,
     the last process will have to read more data, i.e., to the
     end of the file. */
  max_number_of_bytes_ll =
    number_of_bytes_ll + total_number_of_bytes % pool_size;
  if (max_number_of_bytes_ll < INT_MAX) {
    if (my_rank == last_guy)
      number_of_bytes = (int) max_number_of_bytes_ll;
      number_of_bytes = (int) number_of_bytes_ll;
    read_buffer = (char*) malloc(number_of_bytes);
#ifdef DEBUG
    printf("%3d: allocated %d bytes\n", my_rank, number_of_bytes);
    my_offset = (MPI_Offset) my_rank * number_of_bytes_ll;
#ifdef DEBUG
    printf("%3d: my offset = %lld\n", my_rank, my_offset);
    MPI_File_seek(fh, my_offset, MPI_SEEK_SET);
    start = MPI_Wtime();
    MPI_File_read(fh, read_buffer, number_of_bytes, MPI_BYTE, &status);
    finish = MPI_Wtime();
    MPI_Get_count(&status, MPI_BYTE, &count);
#ifdef DEBUG
    printf("%3d: read %d bytes\n", my_rank, count);
    MPI_File_get_position(fh, &my_offset);
#ifdef DEBUG
    printf("%3d: my offset = %lld\n", my_rank, my_offset);
    io_time = finish - start;
    MPI_Allreduce(&io_time, &longest_io_time, 1, MPI_DOUBLE, MPI_MAX,
    if (i_am_the_master) {
      printf("longest_io_time       = %f seconds\n", longest_io_time);
      printf("total_number_of_bytes = %lld\n", total_number_of_bytes);
      printf("transfer rate         = %f MB/s\n",
         total_number_of_bytes / longest_io_time / MBYTE);
  else {
    if (i_am_the_master) {
      printf("Not enough memory to read the file.\n");
      printf("Consider running on more nodes.\n");
  } /* of if(max_number_of_bytes_ll < INT_MAX) */
Previous instructions

MPI_Init() and MPI_Finalize(); Used to initialize and finalize the MPI program.

MPI_Comm_rank() and MPI_Comm_size(); Used to find the rank of a process and the total number of processes.

MPI_Offset my_offset; Variables to store offsets in the MPI file.

MPI_Wtime(); Used to find the local io time.

MPI_Status status; Store status of MPI_File_read().

MPI_File fh; MPI file handle.

MPI_Bcast(); Used to broadcast information from the master to the other processes.

MPI_Barrier(); Used to synchronize the processes.

MPI_Abort(); Abort the program.

MPI_File_open(); Open MPI file.

MPI_File_seek(); Seek in MPI file.

MPI_File_get_position(); Get the current position in MPI file.

MPI_Allreduce(); Used with MPI_MAX to find the maximum io time of all the processes.

MPI_File_close(); Close an MPI file.

New instructions

MPI_Error_class( file_open_error, &error_class ); Extracts the error_class from the MPI error file_open_error.

MPI_Error_string( error_class, error_string, &length_of_error_string ); Places the error string associated with error_class in error_string.

MPI_File_get_size( fh, &total_number_of_bytes ); Get the size of the MPI file fh.

MPI_File_read( fh, read_buffer, number_of_bytes, MPI_BYTE, &status ); Read number_of_bytes elements of type MPI_BYTE from fh, starting at the current position of the file pointer, and place the result in read_buffer.

Compile & Run

Note: This example will attempt to read the output from the mpi_io/01_writing_mpi_files example. It is therefore necessary to perform make submit on that project first and wait for it to complete before doing a make submit on this project.

If you have not already done so, obtain all the example code here.

Switch to the Intel compiler (optional, only necessary once in each terminal session)

$ module load intel

Compile the program using

$ make

Submit the job to the queue

$ make submit

The output files from the program execution are placed in the output folder

$ cat output/*
longest_io_time       = 2.896448 seconds
total_number_of_bytes = 469762048
transfer rate         = 154.672211 MB/s