/***************************************************************************
*
*   Copyright (c) 1998, 1999 Timpanogas Research Group, Inc.
*   895 West Center Street
*   Orem, Utah 84057
*   jmerkey@timpanogas.com
*
*   This program is free software; you can redistribute it and/or modify it
*   under the terms of the GNU General Public License as published by the
*   Free Software Foundation, version 2, or any later version.
*
*   This program is distributed in the hope that it will be useful, but
*   WITHOUT ANY WARRANTY; without even the implied warranty of
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
*   General Public License for more details.
*
*   You are free to modify and re-distribute this program in accordance
*   with the terms specified in the GNU Public License. The copyright
*   contained in this code is required to be present in any derivative
*   works and you are required to provide the source code for this
*   program as part of any commercial or non-commercial distribution.
*   You are required to respect the rights of the Copyright holders
*   named within this code.
*
*   jmerkey@timpanogas.com and TRG, Inc. are the official maintainers of
*   this code. You are encouraged to report any bugs, problems, fixes,
*   suggestions, and comments about this software to jmerkey@timpanogas.com
*   or linux-kernel@vger.rutgers.edu. New releases, patches, bug fixes, and
*   technical documentation can be found at www.timpanogas.com. TRG will
*   periodically post new releases of this software to www.timpanogas.com
*   that contain bug fixes and enhanced capabilities.
*
*   Original Authorship :
*      source code written by Jeff V. Merkey, TRG, Inc.
*
*   Original Contributors :
*      Jeff V. Merkey, TRG, Inc.
*      Darren Major, TRG, Inc.
*      Alan Cox, RedHat Software, Inc.
*
****************************************************************************
*
*
*   AUTHOR   : Jeff V. Merkey (jmerkey@timpanogas.com)
*   FILE     : ASYNC.C
*   DESCRIP  : FENRIS Asynch IO Library
*   DATE     : July 13, 2000 (my birthday)
*
*
***************************************************************************/

#include "globals.h"

// We use eight (8) processes for the async io manager. Each disk bin
// is computed as disk % 8 and matched into one of eight bin groups.
// On SMP versions of linux, this should allow us to keep as many
// drive spindles active as possible at one time without eating too
// many server processes to do this, provided the linux kernel
// load balances these processes across processors.

// Per-bin request queues (doubly linked through ASYNCH_IO next/prior).
ASYNCH_IO *disk_io_head[8] = { 0 };
ASYNCH_IO *disk_io_tail[8] = { 0 };

// Per-disk hash buckets; each bucket is a second doubly linked chain
// (hnext/hprior) selected by ((sector_offset >> 8) & 0xF).
ASYNCH_IO_HEAD asynch_io_head[MAX_DISKS];

#if (PROFILE_AIO)
ULONG hash_hits = 0;
ULONG hash_misses = 0;
ULONG hash_fill = 0;
ULONG hash_total = 0;
ULONG probe_avg = 0;
ULONG probe_max = 0;
#endif

#if (LINUX_SLEEP)
NWFSInitMutex(disk_io_sem0);
NWFSInitMutex(disk_io_sem1);
NWFSInitMutex(disk_io_sem2);
NWFSInitMutex(disk_io_sem3);
NWFSInitMutex(disk_io_sem4);
NWFSInitMutex(disk_io_sem5);
NWFSInitMutex(disk_io_sem6);
NWFSInitMutex(disk_io_sem7);
NWFSInitMutex(asynch_head_lock);

// One semaphore per bin, indexed by (disk % 8).
struct semaphore *io_sem_table[8] =
{
    &disk_io_sem0, &disk_io_sem1, &disk_io_sem2, &disk_io_sem3,
    &disk_io_sem4, &disk_io_sem5, &disk_io_sem6, &disk_io_sem7
};
#endif

extern void RunAsynchIOQueue(ULONG disk);

// Acquire the lock that guards the queue heads/tails and the hash
// buckets.  Compiles to a no-op unless LINUX_SLEEP is set.
//
// NOTE(review): when the wait returns -EINTR we only log and return,
// so the caller proceeds as if it held the lock and the matching
// asynch_unlock() still signals the semaphore.  Fixing this needs a
// way to report failure to callers -- confirm the desired policy.
void asynch_lock(void)
{
#if (LINUX_SLEEP)
    if (WaitOnSemaphore(&asynch_head_lock) == -EINTR)
       NWFSPrint("asynch lock was interrupted\n");
#endif
}

// Release the lock taken by asynch_lock().
void asynch_unlock(void)
{
#if (LINUX_SLEEP)
    SignalSemaphore(&asynch_head_lock);
#endif
}

// Link io into the hash bucket for its disk and sector range, keeping
// the bucket chain in ascending sector_offset order (a request is
// placed in front of the first entry whose offset is not smaller).
//
// Returns 0 on success, or (ULONG)-1 if an entry already in the bucket
// carries a different disk id than io; in that case io is not linked.
ULONG hash_disk_io(ASYNCH_IO *io)
{
    int i = (io->disk % MAX_DISKS);
    int j = ((io->sector_offset >> 8) & 0xF);
    ASYNCH_IO *old, *p;

    // empty bucket: io becomes both head and tail
    if (!asynch_io_head[i].hash_head[j])
    {
       io->hnext = io->hprior = NULL;
       asynch_io_head[i].hash_head[j] = io;
       asynch_io_head[i].hash_tail[j] = io;
       return 0;
    }

    old = NULL;
    p = asynch_io_head[i].hash_head[j];
    while (p)
    {
       if (p->disk != io->disk)
       {
          NWFSPrint("nwfs: io request has bad disk id (%d/%d)\n",
                    (int)p->disk, (int)io->disk);
          return (ULONG) -1;
       }

       if (p->sector_offset < io->sector_offset)
       {
          old = p;
          p = p->hnext;
          continue;
       }

       // insert in front of p; io becomes the bucket head when p
       // had no predecessor
       io->hnext = p;
       io->hprior = p->hprior;
       if (p->hprior)
          p->hprior->hnext = io;
       else
          asynch_io_head[i].hash_head[j] = io;
       p->hprior = io;
       return 0;
    }

    // every entry sorts before io: append at the tail
    old->hnext = io;
    io->hnext = NULL;
    io->hprior = old;
    asynch_io_head[i].hash_tail[j] = io;
    return 0;
}

// Unlink io from the hash bucket selected by its disk and sector
// offset, then clear its hash links.  Always returns 0.
//
// A request that is not linked (not the bucket head and with no
// predecessor) is left alone instead of dereferencing a NULL hprior,
// so calling this on an unhashed request is harmless.
ULONG unhash_disk_io(ASYNCH_IO *io)
{
    int i = (io->disk % MAX_DISKS);
    int j = ((io->sector_offset >> 8) & 0xF);

    if (asynch_io_head[i].hash_head[j] == io)
    {
       asynch_io_head[i].hash_head[j] = io->hnext;
       if (asynch_io_head[i].hash_head[j])
          asynch_io_head[i].hash_head[j]->hprior = NULL;
       else
          asynch_io_head[i].hash_tail[j] = NULL;
    }
    else
    if (io->hprior)
    {
       io->hprior->hnext = io->hnext;
       if (io == asynch_io_head[i].hash_tail[j])
          asynch_io_head[i].hash_tail[j] = io->hprior;
       else
       if (io->hnext)
          io->hnext->hprior = io->hprior;
    }

    io->hnext = io->hprior = NULL;
    return 0;
}

// Acquire the semaphore for the bin that disk maps to (disk % 8).
// No-op unless LINUX_SLEEP is set.
//
// NOTE(review): as with asynch_lock(), an interrupted wait is only
// logged and the caller continues without the semaphore.
void lock_io(ULONG disk)
{
#if (LINUX_SLEEP)
    if (WaitOnSemaphore(io_sem_table[(disk % 8)]) == -EINTR)
       NWFSPrint("lock io was interrupted\n");
#endif
}

// Release the semaphore for the bin that disk maps to (disk % 8).
void unlock_io(ULONG disk)
{
#if (LINUX_SLEEP)
    SignalSemaphore(io_sem_table[(disk % 8)]);
#endif
}

// Service every request queued on the bin that disk maps to, looping
// until the bin's queue is observed empty.
//
// For each request: ccode is set to 0 on success, ASIO_IO_ERROR when
// the disk routine returns 0, ASIO_BAD_COMMAND for an unknown command,
// or ASIO_BAD_SIGNATURE when the request was not marked
// ASIO_SUBMIT_IO.  The callback (if any) is then invoked and the
// signature is cleared.
void process_asynch_io(ULONG disk)
{
    int i = (disk % 8);
    int r, j;
    ASYNCH_IO *list, *io;

    // continue to cycle through this list until we have completely
    // emptied the list.
    while (disk_io_head[i])
    {
       // take the entire list, zero the head and tail and process the list
       // from this context. this will simulate an alternating A and B list
       // and avoid elevator starvation. We also need to clear all the
       // asynch hash list info for this run so folks don't attempt to
       // link new aio requests to this active aio chain.
       asynch_lock();

       list = disk_io_head[i];
       disk_io_head[i] = disk_io_tail[i] = NULL;

       // clear the buckets of every disk that maps to this bin
       for (r = 0; r < MAX_DISKS; r++)
       {
          if ((r % 8) != i)
             continue;

          for (j = 0; j < 16; j++)
          {
             asynch_io_head[r].hash_head[j] = NULL;
             asynch_io_head[r].hash_tail[j] = NULL;
          }
       }

#if (PROFILE_AIO)
       if (hash_hits || hash_misses)
       {
          if (hash_total)
             probe_avg = (probe_avg / hash_total);
          else
             probe_avg = 0;

          // we seem to average 96% hit efficiency for locating an insertion
          // point in the aio list with an average of 1 probe per insert. The
          // other 4% involve cases where a single element is on the list,
          // and we only probe 1 time to find our insert point.
          NWFSPrint("hits-%d misses-%d fill-%d total-%d probe_avg-%d probe_max-%d\n",
                    (int)hash_hits, (int)hash_misses, (int)hash_fill,
                    (int)hash_total, (int)probe_avg, (int)probe_max);
       }
       hash_hits = hash_misses = hash_fill = hash_total = probe_avg = probe_max = 0;
#endif
       asynch_unlock();

       while (list)
       {
          io = list;
          list = list->next;

          // detach the request completely.  the bucket heads were
          // cleared above, so the hash links would otherwise keep
          // pointing into this retired chain.
          io->next = io->prior = NULL;
          io->hnext = io->hprior = NULL;

          if (io->signature != ASIO_SUBMIT_IO)
          {
             NWFSPrint("nwfs: asynch io process - bad io request cmd-%X s-%X\n",
                       (unsigned)io->command, (unsigned)io->signature);
             io->ccode = ASIO_BAD_SIGNATURE;
          }
          else
          {
             switch (io->command)
             {
                case ASYNCH_READ:
                   io->return_code = pReadDiskSectors(io->disk,
                                                      io->sector_offset,
                                                      io->buffer,
                                                      io->sector_count,
                                                      io->sector_count);
                   io->ccode = 0;
                   if (!io->return_code)
                      io->ccode = ASIO_IO_ERROR;
                   break;

                case ASYNCH_WRITE:
                   io->return_code = pWriteDiskSectors(io->disk,
                                                       io->sector_offset,
                                                       io->buffer,
                                                       io->sector_count,
                                                       io->sector_count);
                   io->ccode = 0;
                   if (!io->return_code)
                      io->ccode = ASIO_IO_ERROR;
                   break;

                case ASYNCH_FILL:
                   io->return_code = pZeroFillDiskSectors(io->disk,
                                                          io->sector_offset,
                                                          io->sector_count,
                                                          io->sector_count);
                   io->ccode = 0;
                   if (!io->return_code)
                      io->ccode = ASIO_IO_ERROR;
                   break;

                default:
                   io->ccode = ASIO_BAD_COMMAND;
                   break;
             }
          }

          // common completion path for every outcome above
          if (io->call_back_routine)
             (io->call_back_routine)(io);

          // NOTE(review): this writes to the request after the callback
          // has run; if a callback can release or resubmit the request
          // this store is unsafe -- confirm the callback contract.
          io->signature = 0;
       }
    }
    return;
}

// Returns 1 if the bin that disk maps to has queued requests, else 0.
ULONG asynch_io_pending(ULONG disk)
{
    return ((disk_io_head[(disk % 8)]) ? 1 : 0);
}

// Unlink io from the request queue of the bin that disk maps to and
// clear its next/prior links.  Returns io.
//
// A request that is not on the queue (not the head and with no
// predecessor) is left alone instead of dereferencing a NULL prior.
// This routine does not take asynch_lock() and does not touch the hash
// chain.
// NOTE(review): presumably callers hold the lock and call
// unhash_disk_io() themselves -- verify against callers.
ASYNCH_IO *remove_io(ULONG disk, ASYNCH_IO *io)
{
    int i = (disk % 8);

    if (disk_io_head[i] == io)
    {
       disk_io_head[i] = io->next;
       if (disk_io_head[i])
          disk_io_head[i]->prior = NULL;
       else
          disk_io_tail[i] = NULL;
    }
    else
    if (io->prior)
    {
       io->prior->next = io->next;
       if (io == disk_io_tail[i])
          disk_io_tail[i] = io->prior;
       else
       if (io->next)
          io->next->prior = io->prior;
    }

    io->next = io->prior = NULL;
    return io;
}

// Link io into the request queue of the bin that disk maps to.
// The search for an insertion point starts at the head of io's hash
// bucket when that bucket is populated, otherwise at the head of the
// bin queue.  Always returns 0.
//
// NOTE(review): the advance test below requires BOTH a smaller disk id
// and a smaller sector offset, so for requests on the same disk it is
// never true and io is always linked directly in front of the probe
// start.  A (disk, sector) ordering was probably intended; left as is
// because changing it alters request scheduling order.
ULONG index_io(ULONG disk, ASYNCH_IO *io)
{
    int i = (disk % 8);
    int r = (io->disk % MAX_DISKS);
    int j = ((io->sector_offset >> 8) & 0xF);
    ASYNCH_IO *old, *p;
#if (PROFILE_AIO)
    int count = 1;
#endif

    // empty queue: io becomes both head and tail
    if (!disk_io_tail[i])
    {
       io->next = io->prior = NULL;
       disk_io_head[i] = io;
       disk_io_tail[i] = io;
#if (PROFILE_AIO)
       hash_fill++;
       hash_total++;
#endif
       goto linked;
    }

    if (asynch_io_head[r].hash_head[j])
    {
#if (PROFILE_AIO)
       hash_hits++;
#endif
       p = asynch_io_head[r].hash_head[j];
    }
    else
    {
#if (PROFILE_AIO)
       hash_misses++;
#endif
       p = disk_io_head[i];
    }

#if (PROFILE_AIO)
    hash_total++;
#endif

    old = NULL;
    while (p)
    {
       if ((p->disk < io->disk) && (p->sector_offset < io->sector_offset))
       {
          old = p;
          p = p->next;
#if (PROFILE_AIO)
          count++;
#endif
          continue;
       }

       // insert in front of p; io becomes the queue head when p had
       // no predecessor
       io->next = p;
       io->prior = p->prior;
       if (p->prior)
          p->prior->next = io;
       else
          disk_io_head[i] = io;
       p->prior = io;
       goto linked;
    }

    // ran off the end of the queue: append at the tail
    old->next = io;
    io->next = NULL;
    io->prior = old;
    disk_io_tail[i] = io;

linked:
#if (PROFILE_AIO)
    if (count > probe_max)
       probe_max = count;
    probe_avg += count;
#endif
    return 0;
}

// Queue io on the bin that disk maps to and add it to the hash index,
// all under asynch_lock().  A request whose signature is already
// ASIO_SUBMIT_IO is rejected with a message and left untouched.
//
// The return value of hash_disk_io() is not checked: if hashing fails
// the request stays on the bin queue and is simply not reachable
// through the hash buckets.
void insert_io(ULONG disk, ASYNCH_IO *io)
{
    asynch_lock();

    if (io->signature == ASIO_SUBMIT_IO)
    {
       NWFSPrint("nwfs: asynch io request already active\n");
       asynch_unlock();
       return;
    }

    index_io(disk, io);
    io->signature = ASIO_SUBMIT_IO;
    hash_disk_io(io);

    asynch_unlock();
    return;
}