/***************************************************************************
*
*   Copyright (c) 1998, 1999 Timpanogas Research Group, Inc.
*   895 West Center Street
*   Orem, Utah  84057
*   jmerkey@timpanogas.com
*
*   This program is free software; you can redistribute it and/or modify it
*   under the terms of the GNU General Public License as published by the
*   Free Software Foundation, version 2, or any later version.
*
*   This program is distributed in the hope that it will be useful, but
*   WITHOUT ANY WARRANTY; without even the implied warranty of
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
*   General Public License for more details.
*
*   You are free to modify and re-distribute this program in accordance
*   with the terms specified in the GNU Public License.  The copyright
*   contained in this code is required to be present in any derivative
*   works and you are required to provide the source code for this
*   program as part of any commercial or non-commercial distribution.
*   You are required to respect the rights of the Copyright holders
*   named within this code.
*
*   jmerkey@timpanogas.com and TRG, Inc. are the official maintainers of
*   this code.  You are encouraged to report any bugs, problems, fixes,
*   suggestions, and comments about this software to jmerkey@timpanogas.com
*   or linux-kernel@vger.rutgers.edu.  New releases, patches, bug fixes, and
*   technical documentation can be found at www.timpanogas.com.  TRG will
*   periodically post new releases of this software to www.timpanogas.com
*   that contain bug fixes and enhanced capabilities.
*
*   Original Authorship      :
*      source code written by Jeff V. Merkey, TRG, Inc.
*
*   Original Contributors    :
*      Jeff V. Merkey, TRG, Inc.
*      Darren Major, TRG, Inc.
*      Alan Cox, RedHat Software, Inc.
*
****************************************************************************
*
*
*   AUTHOR   :  Jeff V. Merkey (jmerkey@timpanogas.com)
*   FILE     :  ASYNC.C
*   DESCRIP  :  FENRIS Asynch IO Library
*   DATE     :  July 13, 2000 (my birthday)
*
*
***************************************************************************/

#include "globals.h"

// We use eight (8) processes for the async io manager.  Each disk bin
// is computed as disk % 8 and matched into one of eight bin groups.
// On SMP versions of linux, this should allow us to keep as many
// drive spindles active as possible at one time without eating too
// many server processes to do this, provided the linux kernel
// load balances these processes across processors.

// Pending-request lists, one per bin (bin = disk % 8).  disk_io_head[] and
// disk_io_tail[] are the two ends of a doubly linked list chained through
// the next/prior fields of each ASYNCH_IO.  Every slot starts out empty;
// slots without an explicit initializer are zero-initialized.
ASYNCH_IO *disk_io_head[8] = { 0 };
ASYNCH_IO *disk_io_tail[8] = { 0 };

// Hash chain heads/tails, indexed by (disk % MAX_DISKS).  The chains are
// linked through the hnext/hprior fields and are used by index_io() to
// pick a starting point for its insertion scan.
ASYNCH_IO_HEAD asynch_io_head[MAX_DISKS];

#if (PROFILE_AIO)
// Insertion profiling counters.  They are updated by index_io() and are
// printed and then reset by process_asynch_io().
ULONG hash_hits = 0,     // inserts whose scan started at a hash chain head
      hash_misses = 0,   // inserts whose scan started at the list head
      hash_fill = 0,     // inserts into a bin whose list was empty
      hash_total = 0,    // total inserts counted
      probe_avg = 0,     // running sum of probes; divided by hash_total
                         // just before it is printed
      probe_max = 0;     // largest probe count seen for a single insert
#endif

#if (LINUX_SLEEP)

// One semaphore per bin (bin = disk % 8); these are taken and released
// through lock_io() / unlock_io().
// NOTE(review): NWFSInitMutex is defined outside this file; presumably it
// defines and initializes a struct semaphore with the given name (the names
// are used as struct semaphore objects in io_sem_table below) -- confirm.
NWFSInitMutex(disk_io_sem0);
NWFSInitMutex(disk_io_sem1);
NWFSInitMutex(disk_io_sem2);
NWFSInitMutex(disk_io_sem3);
NWFSInitMutex(disk_io_sem4);
NWFSInitMutex(disk_io_sem5);
NWFSInitMutex(disk_io_sem6);
NWFSInitMutex(disk_io_sem7);

// Taken by asynch_lock() and released by asynch_unlock(); held while the
// pending lists and the hash chains are modified.
NWFSInitMutex(asynch_head_lock);

// Lookup table so lock_io()/unlock_io() can select a bin semaphore by index.
struct semaphore *io_sem_table[8]={
    &disk_io_sem0, &disk_io_sem1, &disk_io_sem2, &disk_io_sem3,
    &disk_io_sem4, &disk_io_sem5, &disk_io_sem6, &disk_io_sem7
};

#endif

// Defined in another module.
// NOTE(review): no call to this routine appears in this file -- confirm the
// declaration is still needed here.
extern void RunAsynchIOQueue(ULONG disk);

// Acquire asynch_head_lock.  When the wait reports -EINTR a message is
// printed and the function still returns normally, so the caller cannot
// tell whether the lock is actually held.  Compiles to an empty function
// when LINUX_SLEEP is not enabled.
void asynch_lock(void)
{
#if (LINUX_SLEEP)
    if (WaitOnSemaphore(&asynch_head_lock) != -EINTR)
       return;

    NWFSPrint("asynch lock was interrupted\n");
#endif
}

// Release asynch_head_lock (counterpart of asynch_lock()).  Compiles to an
// empty function when LINUX_SLEEP is not enabled.
void asynch_unlock(void)
{
#if (LINUX_SLEEP)
    SignalSemaphore(&asynch_head_lock);
    return;
#endif
}

// Link io into a hash chain, keeping the chain in ascending sector_offset
// order.  The chain is selected by (io->disk % MAX_DISKS) and by bits 8..11
// of io->sector_offset.  A request is placed in front of the first entry
// whose sector_offset is not smaller than its own.
//
// Returns 0 on success.  Returns -1 (as a ULONG), after printing a message
// and without linking io, if an entry visited during the scan carries a
// different disk id than io.
//
// No locking is done here; insert_io() calls this while holding asynch_lock.
ULONG hash_disk_io(ASYNCH_IO *io)
{
    register int slot = (io->disk % MAX_DISKS);
    register int bucket = ((io->sector_offset >> 8) & 0xF);
    ASYNCH_IO *prev = NULL;
    ASYNCH_IO *cur = asynch_io_head[slot].hash_head[bucket];

    // empty chain: io becomes both head and tail
    if (!cur)
    {
       io->hnext = io->hprior = NULL;
       asynch_io_head[slot].hash_head[bucket] = io;
       asynch_io_head[slot].hash_tail[bucket] = io;
       return 0;
    }

    // locate the first entry that must follow io
    for (; cur; prev = cur, cur = cur->hnext)
    {
       if (cur->disk != io->disk)
       {
          NWFSPrint("nwfs:  io request has bad disk id (%d/%d)\n",
                    (int)cur->disk, (int)io->disk);
          return -1;
       }

       if (cur->sector_offset >= io->sector_offset)
          break;
    }

    // every entry sorts below io: append behind the last one visited
    if (!cur)
    {
       prev->hnext = io;
       io->hnext = NULL;
       io->hprior = prev;
       asynch_io_head[slot].hash_tail[bucket] = io;
       return 0;
    }

    // link io directly in front of cur; io becomes the new chain head when
    // cur had no predecessor
    io->hnext = cur;
    io->hprior = cur->hprior;
    if (cur->hprior)
       cur->hprior->hnext = io;
    else
       asynch_io_head[slot].hash_head[bucket] = io;
    cur->hprior = io;
    return 0;
}

// Unlink io from the hash chain selected by (io->disk % MAX_DISKS) and bits
// 8..11 of io->sector_offset, then clear io's hnext/hprior links.  Always
// returns 0.
//
// io is expected to be on that chain: when it is not the chain head,
// io->hprior is dereferenced without a check, and when it is neither head
// nor tail, io->hnext is dereferenced as well.
ULONG unhash_disk_io(ASYNCH_IO *io)
{
    register int slot = (io->disk % MAX_DISKS);
    register int bucket = ((io->sector_offset >> 8) & 0xF);
    ASYNCH_IO_HEAD *chain = &asynch_io_head[slot];

    if (chain->hash_head[bucket] != io)
    {
       // interior or tail entry
       io->hprior->hnext = io->hnext;
       if (chain->hash_tail[bucket] == io)
          chain->hash_tail[bucket] = io->hprior;
       else
          io->hnext->hprior = io->hprior;
    }
    else
    {
       // head entry: promote the successor, or empty the chain
       chain->hash_head[bucket] = (void *) io->hnext;
       if (!chain->hash_head[bucket])
          chain->hash_tail[bucket] = NULL;
       else
          chain->hash_head[bucket]->hprior = NULL;
    }

    io->hprior = 0;
    io->hnext = 0;
    return 0;
}

// Acquire the semaphore of the bin that disk maps to (disk % 8).  When the
// wait reports -EINTR a message is printed and the function still returns
// normally.  Compiles to an empty function when LINUX_SLEEP is not enabled.
void lock_io(ULONG disk)
{
#if (LINUX_SLEEP)
    struct semaphore *bin_sem = io_sem_table[(disk % 8)];

    if (WaitOnSemaphore(bin_sem) == -EINTR)
       NWFSPrint("lock io was interrupted\n");
#endif
}

// Release the semaphore of the bin that disk maps to (disk % 8); counterpart
// of lock_io().  Compiles to an empty function when LINUX_SLEEP is not
// enabled.
void unlock_io(ULONG disk)
{
#if (LINUX_SLEEP)
    struct semaphore *bin_sem = io_sem_table[(disk % 8)];

    SignalSemaphore(bin_sem);
#endif
}

// Drain the pending request list of the bin that disk maps to (disk % 8).
//
// Each pass detaches the whole list under asynch_lock, resets the hash
// chains of every disk slot belonging to this bin, and then services the
// detached requests with the lock released.  Requests queued in the
// meantime land on a fresh list and are picked up by the next pass; the
// function returns once the bin's list is found empty.
//
// For every request the completion code is stored in io->ccode, the
// optional call_back_routine is invoked, and only afterwards io->signature
// is cleared.
void process_asynch_io(ULONG disk)
{
   register int bin = (disk % 8);
   register int slot, bucket;
   ASYNCH_IO *pending, *req;

   while (disk_io_head[bin])
   {
      // Detach the entire list and service it from this context.  Taking
      // the list as a unit alternates between an "A" and a "B" list and
      // avoids elevator starvation.
      asynch_lock();
      pending = disk_io_head[bin];
      disk_io_tail[bin] = 0;
      disk_io_head[bin] = 0;

      // Forget the hash chains of every disk slot that maps to this bin so
      // new requests are not linked relative to the chain being serviced.
      for (slot = bin; slot < MAX_DISKS; slot += 8)
      {
         for (bucket = 0; bucket < 16; bucket++)
         {
            asynch_io_head[slot].hash_head[bucket] = 0;
            asynch_io_head[slot].hash_tail[bucket] = 0;
         }
      }

#if (PROFILE_AIO)
      if (hash_hits || hash_misses)
      {
         // probe_avg holds a running sum up to this point
         probe_avg = hash_total ? (probe_avg / hash_total) : 0;

         // Original author's observation: roughly 96% hit efficiency for
         // locating an insertion point in the aio list, averaging one
         // probe per insert; the remaining 4% are cases where a single
         // element is on the list and one probe finds the insert point.

         NWFSPrint("hits-%d misses-%d fill-%d total-%d probe_avg-%d probe_max-%d\n",
                   (int)hash_hits, (int)hash_misses, (int)hash_fill,
                   (int)hash_total, (int)probe_avg, (int)probe_max);
      }
      probe_max = 0;
      probe_avg = 0;
      hash_total = 0;
      hash_fill = 0;
      hash_misses = 0;
      hash_hits = 0;
#endif
      asynch_unlock();

      while (pending)
      {
         // pop the next request and isolate it from the list
         req = pending;
         pending = req->next;
         req->prior = 0;
         req->next = 0;

         if (req->signature != ASIO_SUBMIT_IO)
         {
            NWFSPrint("nwfs:  asynch io process - bad io request cmd-%X s-%X\n",
                      (unsigned)req->command, (unsigned)req->signature);
            req->ccode = ASIO_BAD_SIGNATURE;
         }
         else
         {
            switch (req->command)
            {
               case ASYNCH_READ:
                  req->return_code = pReadDiskSectors(req->disk,
                                                      req->sector_offset,
                                                      req->buffer,
                                                      req->sector_count,
                                                      req->sector_count);
                  // a zero return code is reported as an I/O error
                  if (req->return_code)
                     req->ccode = 0;
                  else
                     req->ccode = ASIO_IO_ERROR;
                  break;

               case ASYNCH_WRITE:
                  req->return_code = pWriteDiskSectors(req->disk,
                                                       req->sector_offset,
                                                       req->buffer,
                                                       req->sector_count,
                                                       req->sector_count);
                  if (req->return_code)
                     req->ccode = 0;
                  else
                     req->ccode = ASIO_IO_ERROR;
                  break;

               case ASYNCH_FILL:
                  req->return_code = pZeroFillDiskSectors(req->disk,
                                                          req->sector_offset,
                                                          req->sector_count,
                                                          req->sector_count);
                  if (req->return_code)
                     req->ccode = 0;
                  else
                     req->ccode = ASIO_IO_ERROR;
                  break;

               default:
                  req->ccode = ASIO_BAD_COMMAND;
                  break;
            }
         }

         // common completion: notify the owner, then mark the request idle
         if (req->call_back_routine)
            (req->call_back_routine)(req);
         req->signature = 0;
      }
   }
}

// Report whether the bin that disk maps to (disk % 8) has queued requests:
// returns 1 when the bin's list head is non-NULL, 0 otherwise.  The head is
// read without taking asynch_lock.
ULONG asynch_io_pending(ULONG disk)
{
   if (disk_io_head[(disk % 8)])
      return 1;

   return 0;
}

// Unlink io from the pending list of the bin that disk maps to (disk % 8),
// clear its next/prior links and return io.
//
// Only the next/prior list is touched; the hash links (hnext/hprior) are
// left alone and no lock is taken here.  io is expected to be on the list:
// when it is not the list head, io->prior is dereferenced without a check,
// and when it is neither head nor tail, io->next is dereferenced as well.
ASYNCH_IO *remove_io(ULONG disk, ASYNCH_IO *io)
{
    register int bin = (disk % 8);

    if (disk_io_head[bin] != io)
    {
       // interior or tail entry
       io->prior->next = io->next;
       if (disk_io_tail[bin] == io)
          disk_io_tail[bin] = io->prior;
       else
          io->next->prior = io->prior;
    }
    else
    {
       // head entry: promote the successor, or empty the list
       disk_io_head[bin] = (void *) io->next;
       if (!disk_io_head[bin])
          disk_io_tail[bin] = NULL;
       else
          disk_io_head[bin]->prior = NULL;
    }

    io->prior = 0;
    io->next = 0;
    return io;
}

// Link io into the pending request list of the bin that disk maps to
// (disk % 8).  Always returns 0.
//
// The scan for an insertion point starts at the head of the hash chain for
// (io->disk % MAX_DISKS, bits 8..11 of io->sector_offset) when that chain
// is non-empty, and at the head of the bin's list otherwise.  From there
// the list is walked forward and io is linked in front of the first entry
// that does not sort below it, or appended at the tail if no such entry is
// found.
//
// Entries are ordered by disk first and by sector_offset within a disk.
// The earlier test required BOTH the disk and the sector_offset of an
// entry to be smaller than io's.  That test is never true for two requests
// on the same disk, so same-disk requests were always linked in front of
// the scan start and never ended up in sector order.
//
// No locking is done here; insert_io() calls this while holding asynch_lock.
// With PROFILE_AIO enabled the hit/miss/fill/total counters and the probe
// statistics are updated on every call.
ULONG index_io(ULONG disk, ASYNCH_IO *io)
{
    register int i = (disk % 8);
    register int r = (io->disk % MAX_DISKS);
    register int j = ((io->sector_offset >> 8) & 0xF);
#if (PROFILE_AIO)
    register int count = 1;
#endif
    ASYNCH_IO *old, *p;

    // empty list: io becomes both head and tail
    if (!disk_io_tail[i])
    {
       io->next = io->prior = NULL;
       disk_io_head[i] = io;
       disk_io_tail[i] = io;
#if (PROFILE_AIO)
       hash_fill++;
       hash_total++;

       if (count > probe_max)
          probe_max = count;
       probe_avg += count;
#endif
       return 0;
    }

    // choose where the scan starts
    if (asynch_io_head[r].hash_head[j])
    {
#if (PROFILE_AIO)
       hash_hits++;
#endif
       p = asynch_io_head[r].hash_head[j];
    }
    else
    {
#if (PROFILE_AIO)
       hash_misses++;
#endif
       p = disk_io_head[i];
    }
#if (PROFILE_AIO)
    hash_total++;
#endif

    // advance while p sorts strictly below io in (disk, sector_offset) order
    old = NULL;
    while (p)
    {
       if ((p->disk > io->disk) ||
           ((p->disk == io->disk) &&
            (p->sector_offset >= io->sector_offset)))
          break;

       old = p;
       p = p->next;
#if (PROFILE_AIO)
       count++;
#endif
    }

    if (p)
    {
       // link io directly in front of p; io becomes the new list head
       // when p had no predecessor
       io->next = p;
       io->prior = p->prior;
       if (p->prior)
          p->prior->next = io;
       else
          disk_io_head[i] = io;
       p->prior = io;
    }
    else
    {
       // ran off the end: append behind the last entry visited
       old->next = io;
       io->next = NULL;
       io->prior = old;
       disk_io_tail[i] = io;
    }

#if (PROFILE_AIO)
    if (count > probe_max)
       probe_max = count;
    probe_avg += count;
#endif
    return 0;
}

// Queue io for asynchronous processing on the bin that disk maps to.
//
// Under asynch_lock: a request whose signature is already ASIO_SUBMIT_IO is
// rejected with a message and left untouched.  Otherwise the request is
// linked into the bin's pending list, stamped with ASIO_SUBMIT_IO and then
// added to its hash chain.  The return values of index_io() and
// hash_disk_io() are not examined.
void insert_io(ULONG disk, ASYNCH_IO *io)
{
    asynch_lock();

    if (io->signature != ASIO_SUBMIT_IO)
    {
       index_io(disk, io);
       io->signature = ASIO_SUBMIT_IO;
       hash_disk_io(io);
    }
    else
    {
       NWFSPrint("nwfs:  asynch io request already active\n");
    }

    asynch_unlock();
}