Re: Linux 2.6.20.16

From: Willy Tarreau
Date: Thu Aug 16 2007 - 01:47:56 EST


diff --git a/Makefile b/Makefile
index 947ff3c..b3806cb 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 20
-EXTRAVERSION = .15
+EXTRAVERSION = .16
NAME = Homicidal Dwarf Hamster

# *DOCUMENTATION*
diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 5e47683..9bf056e 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -367,10 +367,6 @@ ENTRY(system_call)
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
GET_THREAD_INFO(%ebp)
- testl $TF_MASK,PT_EFLAGS(%esp)
- jz no_singlestep
- orl $_TIF_SINGLESTEP,TI_flags(%ebp)
-no_singlestep:
# system call tracing in operation / emulation
/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
@@ -385,6 +381,10 @@ syscall_exit:
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
+ testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
+ jz no_singlestep
+ orl $_TIF_SINGLESTEP,TI_flags(%ebp)
+no_singlestep:
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx # current->work
jne syscall_exit_work
diff --git a/arch/i386/oprofile/nmi_int.c b/arch/i386/oprofile/nmi_int.c
index 3700eef..be4a9a8 100644
--- a/arch/i386/oprofile/nmi_int.c
+++ b/arch/i386/oprofile/nmi_int.c
@@ -131,7 +131,6 @@ static void nmi_save_registers(void * dummy)
{
int cpu = smp_processor_id();
struct op_msrs * msrs = &cpu_msrs[cpu];
- model->fill_in_addresses(msrs);
nmi_cpu_save_registers(msrs);
}

@@ -195,6 +194,7 @@ static struct notifier_block profile_exceptions_nb = {
static int nmi_setup(void)
{
int err=0;
+ int cpu;

if (!allocate_msrs())
return -ENOMEM;
@@ -207,6 +207,13 @@ static int nmi_setup(void)
/* We need to serialize save and setup for HT because the subset
* of msrs are distinct for save and setup operations
*/
+
+ /* Assume saved/restored counters are the same on all CPUs */
+ model->fill_in_addresses(&cpu_msrs[0]);
+ for_each_possible_cpu (cpu) {
+ if (cpu != 0)
+ cpu_msrs[cpu] = cpu_msrs[0];
+ }
on_each_cpu(nmi_save_registers, NULL, 0, 1);
on_each_cpu(nmi_cpu_setup, NULL, 0, 1);
nmi_enabled = 1;
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index f72e8e8..a84304e 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -177,6 +177,13 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
*/
discard_lazy_cpu_state();

+ /*
+ * Force reload of FP/VEC.
+ * This has to be done before copying stuff into current->thread.fpr/vr
+ * for the reasons explained in the previous comment.
+ */
+ regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC);
+
err |= __copy_from_user(&current->thread.fpr, &sc->fp_regs, FP_REGS_SIZE);

#ifdef CONFIG_ALTIVEC
@@ -198,9 +205,6 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
current->thread.vrsave = 0;
#endif /* CONFIG_ALTIVEC */

- /* Force reload of FP/VEC */
- regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC);
-
return err;
}

diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 2968b90..e67cc4f 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -72,6 +72,8 @@ void show_mem(void)

for_each_online_pgdat(pgdat) {
for (i = 0; i < pgdat->node_spanned_pages; ++i) {
+ if (!pfn_valid(pgdat->node_start_pfn + i))
+ continue;
page = pfn_to_page(pgdat->node_start_pfn + i);
total++;
if (PageReserved(page))
@@ -766,3 +768,9 @@ int in_gate_area_no_task(unsigned long addr)
{
return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}
+
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+{
+ return __alloc_bootmem_core(pgdat->bdata, size,
+ SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0);
+}
diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 3ffa080..e4e0ccb 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -1102,6 +1102,7 @@ static void cyy_intr_chip(struct cyclades_card *cinfo, int chip,

if (data & info->ignore_status_mask) {
info->icount.rx++;
+ spin_unlock(&cinfo->card_lock);
return;
}
if (tty_buffer_request_room(tty, 1)) {
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index cef1287..550ac72 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -255,19 +255,25 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, unsigned long inde

}

-static int write_sb_page(mddev_t *mddev, long offset, struct page *page, int wait)
+static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
{
mdk_rdev_t *rdev;
struct list_head *tmp;
+ mddev_t *mddev = bitmap->mddev;

ITERATE_RDEV(mddev, rdev, tmp)
if (test_bit(In_sync, &rdev->flags)
- && !test_bit(Faulty, &rdev->flags))
+ && !test_bit(Faulty, &rdev->flags)) {
+ int size = PAGE_SIZE;
+ if (page->index == bitmap->file_pages-1)
+ size = roundup(bitmap->last_page_size,
+ bdev_hardsect_size(rdev->bdev));
md_super_write(mddev, rdev,
- (rdev->sb_offset<<1) + offset
+ (rdev->sb_offset<<1) + bitmap->offset
+ page->index * (PAGE_SIZE/512),
- PAGE_SIZE,
+ size,
page);
+ }

if (wait)
md_super_wait(mddev);
@@ -282,7 +288,7 @@ static int write_page(struct bitmap *bitmap, struct page *page, int wait)
struct buffer_head *bh;

if (bitmap->file == NULL)
- return write_sb_page(bitmap->mddev, bitmap->offset, page, wait);
+ return write_sb_page(bitmap, page, wait);

bh = page_buffers(page);

@@ -923,6 +929,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
}

bitmap->filemap[bitmap->file_pages++] = page;
+ bitmap->last_page_size = count;
}
paddr = kmap_atomic(page, KM_USER0);
if (bitmap->flags & BITMAP_HOSTENDIAN)
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 4c2471e..b9ff4e3 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -33,7 +33,6 @@
struct crypt_io {
struct dm_target *target;
struct bio *base_bio;
- struct bio *first_clone;
struct work_struct work;
atomic_t pending;
int error;
@@ -107,6 +106,8 @@ struct crypt_config {

static struct kmem_cache *_crypt_io_pool;

+static void clone_init(struct crypt_io *, struct bio *);
+
/*
* Different IV generation algorithms:
*
@@ -378,25 +379,20 @@ static int crypt_convert(struct crypt_config *cc,
* This should never violate the device limitations
* May return a smaller bio when running out of pages
*/
-static struct bio *
-crypt_alloc_buffer(struct crypt_config *cc, unsigned int size,
- struct bio *base_bio, unsigned int *bio_vec_idx)
+static struct bio *crypt_alloc_buffer(struct crypt_io *io, unsigned int size,
+ unsigned int *bio_vec_idx)
{
+ struct crypt_config *cc = io->target->private;
struct bio *clone;
unsigned int nr_iovecs = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
gfp_t gfp_mask = GFP_NOIO | __GFP_HIGHMEM;
unsigned int i;

- if (base_bio) {
- clone = bio_alloc_bioset(GFP_NOIO, base_bio->bi_max_vecs, cc->bs);
- __bio_clone(clone, base_bio);
- } else
- clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
-
+ clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, cc->bs);
if (!clone)
return NULL;

- clone->bi_destructor = dm_crypt_bio_destructor;
+ clone_init(io, clone);

/* if the last bio was not complete, continue where that one ended */
clone->bi_idx = *bio_vec_idx;
@@ -495,9 +491,6 @@ static void dec_pending(struct crypt_io *io, int error)
if (!atomic_dec_and_test(&io->pending))
return;

- if (io->first_clone)
- bio_put(io->first_clone);
-
bio_endio(io->base_bio, io->base_bio->bi_size, io->error);

mempool_free(io, cc->io_pool);
@@ -562,6 +555,7 @@ static void clone_init(struct crypt_io *io, struct bio *clone)
clone->bi_end_io = crypt_endio;
clone->bi_bdev = cc->dev->bdev;
clone->bi_rw = io->base_bio->bi_rw;
+ clone->bi_destructor = dm_crypt_bio_destructor;
}

static void process_read(struct crypt_io *io)
@@ -585,7 +579,6 @@ static void process_read(struct crypt_io *io)
}

clone_init(io, clone);
- clone->bi_destructor = dm_crypt_bio_destructor;
clone->bi_idx = 0;
clone->bi_vcnt = bio_segments(base_bio);
clone->bi_size = base_bio->bi_size;
@@ -615,8 +608,7 @@ static void process_write(struct crypt_io *io)
* so repeat the whole process until all the data can be handled.
*/
while (remaining) {
- clone = crypt_alloc_buffer(cc, base_bio->bi_size,
- io->first_clone, &bvec_idx);
+ clone = crypt_alloc_buffer(io, base_bio->bi_size, &bvec_idx);
if (unlikely(!clone)) {
dec_pending(io, -ENOMEM);
return;
@@ -631,31 +623,23 @@ static void process_write(struct crypt_io *io)
return;
}

- clone_init(io, clone);
clone->bi_sector = cc->start + sector;
-
- if (!io->first_clone) {
- /*
- * hold a reference to the first clone, because it
- * holds the bio_vec array and that can't be freed
- * before all other clones are released
- */
- bio_get(clone);
- io->first_clone = clone;
- }
-
remaining -= clone->bi_size;
sector += bio_sectors(clone);

- /* prevent bio_put of first_clone */
+ /* Grab another reference to the io struct
+ * before we kick off the request */
if (remaining)
atomic_inc(&io->pending);

generic_make_request(clone);

+ /* Do not reference clone after this - it
+ * may be gone already. */
+
/* out of memory -> run queues */
if (remaining)
- congestion_wait(bio_data_dir(clone), HZ/100);
+ congestion_wait(WRITE, HZ/100);
}
}

@@ -954,10 +938,12 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
struct crypt_config *cc = ti->private;
struct crypt_io *io;

+ if (bio_barrier(bio))
+ return -EOPNOTSUPP;
+
io = mempool_alloc(cc->io_pool, GFP_NOIO);
io->target = ti;
io->base_bio = bio;
- io->first_clone = NULL;
io->error = io->post_process = 0;
atomic_set(&io->pending, 0);
kcryptd_queue_io(io);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index dfe3214..2c404f7 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -415,7 +415,7 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio)
raid0_conf_t *conf = mddev_to_conf(mddev);
struct strip_zone *zone;
mdk_rdev_t *tmp_dev;
- unsigned long chunk;
+ sector_t chunk;
sector_t block, rsect;
const int rw = bio_data_dir(bio);

@@ -470,7 +470,6 @@ static int raid0_make_request (request_queue_t *q, struct bio *bio)

sector_div(x, zone->nb_dev);
chunk = x;
- BUG_ON(x != (sector_t)chunk);

x = block >> chunksize_bits;
tmp_dev = zone->dev[sector_div(x, zone->nb_dev)];
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 97ee870..b20c6e9 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1235,17 +1235,24 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
}
r1_bio->read_disk = primary;
for (i=0; i<mddev->raid_disks; i++)
- if (r1_bio->bios[i]->bi_end_io == end_sync_read &&
- test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) {
+ if (r1_bio->bios[i]->bi_end_io == end_sync_read) {
int j;
int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
struct bio *pbio = r1_bio->bios[primary];
struct bio *sbio = r1_bio->bios[i];
- for (j = vcnt; j-- ; )
- if (memcmp(page_address(pbio->bi_io_vec[j].bv_page),
- page_address(sbio->bi_io_vec[j].bv_page),
- PAGE_SIZE))
- break;
+
+ if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
+ for (j = vcnt; j-- ; ) {
+ struct page *p, *s;
+ p = pbio->bi_io_vec[j].bv_page;
+ s = sbio->bi_io_vec[j].bv_page;
+ if (memcmp(page_address(p),
+ page_address(s),
+ PAGE_SIZE))
+ break;
+ }
+ } else
+ j = 0;
if (j >= 0)
mddev->resync_mismatches += r1_bio->sectors;
if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 82249a6..9eb66c1 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1867,6 +1867,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
int d = r10_bio->devs[i].devnum;
bio = r10_bio->devs[i].bio;
bio->bi_end_io = NULL;
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
if (conf->mirrors[d].rdev == NULL ||
test_bit(Faulty, &conf->mirrors[d].rdev->flags))
continue;
@@ -2037,6 +2038,11 @@ static int run(mddev_t *mddev)
/* 'size' is now the number of chunks in the array */
/* calculate "used chunks per device" in 'stride' */
stride = size * conf->copies;
+
+ /* We need to round up when dividing by raid_disks to
+ * get the stride size.
+ */
+ stride += conf->raid_disks - 1;
sector_div(stride, conf->raid_disks);
mddev->size = stride << (conf->chunk_shift-1);

diff --git a/drivers/media/video/saa7134/saa7134-tvaudio.c b/drivers/media/video/saa7134/saa7134-tvaudio.c
index dd759d6..36b3fa3 100644
--- a/drivers/media/video/saa7134/saa7134-tvaudio.c
+++ b/drivers/media/video/saa7134/saa7134-tvaudio.c
@@ -1006,7 +1006,7 @@ int saa7134_tvaudio_init2(struct saa7134_dev *dev)
int saa7134_tvaudio_fini(struct saa7134_dev *dev)
{
/* shutdown tvaudio thread */
- if (dev->thread.pid >= 0) {
+ if (dev->thread.pid > 0) {
dev->thread.shutdown = 1;
wake_up_interruptible(&dev->thread.wq);
wait_for_completion(&dev->thread.exit);
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index c6259c7..40bdcf9 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -1157,13 +1157,16 @@ e1000_probe(struct pci_dev *pdev,
!e1000_check_mng_mode(&adapter->hw))
e1000_get_hw_control(adapter);

- strcpy(netdev->name, "eth%d");
- if ((err = register_netdev(netdev)))
- goto err_register;
-
/* tell the stack to leave us alone until e1000_open() is called */
netif_carrier_off(netdev);
netif_stop_queue(netdev);
+#ifdef CONFIG_E1000_NAPI
+ netif_poll_disable(netdev);
+#endif
+
+ strcpy(netdev->name, "eth%d");
+ if ((err = register_netdev(netdev)))
+ goto err_register;

DPRINTK(PROBE, INFO, "Intel(R) PRO/1000 Network Connection\n");

diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index 38e75cf..aec8c59 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -95,7 +95,7 @@ static int disable_msi = 0;
module_param(disable_msi, int, 0);
MODULE_PARM_DESC(disable_msi, "Disable Message Signaled Interrupt (MSI)");

-static int idle_timeout = 0;
+static int idle_timeout = 100;
module_param(idle_timeout, int, 0);
MODULE_PARM_DESC(idle_timeout, "Watchdog timer for lost interrupts (ms)");

@@ -2341,6 +2341,13 @@ static int sky2_poll(struct net_device *dev0, int *budget)

work_done = sky2_status_intr(hw, work_limit);
if (work_done < work_limit) {
+ /* Bug/Errata workaround?
+ * Need to kick the TX irq moderation timer.
+ */
+ if (sky2_read8(hw, STAT_TX_TIMER_CTRL) == TIM_START) {
+ sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_STOP);
+ sky2_write8(hw, STAT_TX_TIMER_CTRL, TIM_START);
+ }
netif_rx_complete(dev0);

sky2_read32(hw, B0_Y2_SP_LISR);
diff --git a/drivers/serial/mpsc.c b/drivers/serial/mpsc.c
index 3d2fcc5..64ed5ef 100644
--- a/drivers/serial/mpsc.c
+++ b/drivers/serial/mpsc.c
@@ -502,7 +502,8 @@ mpsc_sdma_intr_ack(struct mpsc_port_info *pi)

if (pi->mirror_regs)
pi->shared_regs->SDMA_INTR_CAUSE_m = 0;
- writel(0, pi->shared_regs->sdma_intr_base + SDMA_INTR_CAUSE);
+ writeb(0x00, pi->shared_regs->sdma_intr_base + SDMA_INTR_CAUSE +
+ pi->port.line);
return;
}

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 2275f27..8f820e4 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -59,6 +59,7 @@ extern void *__alloc_bootmem_core(struct bootmem_data *bdata,
unsigned long align,
unsigned long goal,
unsigned long limit);
+extern void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size);

#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
extern void reserve_bootmem(unsigned long addr, unsigned long size);
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index d37f46a..f768e10 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2235,11 +2235,11 @@
#define PCI_DEVICE_ID_INTEL_ICH8_5 0x283e
#define PCI_DEVICE_ID_INTEL_ICH8_6 0x2850
#define PCI_DEVICE_ID_INTEL_ICH9_0 0x2910
-#define PCI_DEVICE_ID_INTEL_ICH9_1 0x2911
+#define PCI_DEVICE_ID_INTEL_ICH9_1 0x2917
#define PCI_DEVICE_ID_INTEL_ICH9_2 0x2912
#define PCI_DEVICE_ID_INTEL_ICH9_3 0x2913
#define PCI_DEVICE_ID_INTEL_ICH9_4 0x2914
-#define PCI_DEVICE_ID_INTEL_ICH9_5 0x2915
+#define PCI_DEVICE_ID_INTEL_ICH9_5 0x2919
#define PCI_DEVICE_ID_INTEL_ICH9_6 0x2930
#define PCI_DEVICE_ID_INTEL_82855PM_HB 0x3340
#define PCI_DEVICE_ID_INTEL_82830_HB 0x3575
diff --git a/include/linux/raid/bitmap.h b/include/linux/raid/bitmap.h
index 6db9a4c..dd5a05d 100644
--- a/include/linux/raid/bitmap.h
+++ b/include/linux/raid/bitmap.h
@@ -232,6 +232,7 @@ struct bitmap {
struct page **filemap; /* list of cache pages for the file */
unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
unsigned long file_pages; /* number of pages in the file */
+ int last_page_size; /* bytes in the last page */

unsigned long flags;

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4463735..7a0cc67 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1137,6 +1137,7 @@ static inline void put_task_struct(struct task_struct *t)
/* Not implemented yet, only for 486*/
#define PF_STARTING 0x00000002 /* being created */
#define PF_EXITING 0x00000004 /* getting shut down */
+#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
#define PF_DUMPCORE 0x00000200 /* dumped core */
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 2a7b38d..1a76bda 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -162,7 +162,7 @@ extern struct workqueue_struct *__create_workqueue(const char *name,
int singlethread,
int freezeable);
#define create_workqueue(name) __create_workqueue((name), 0, 0)
-#define create_freezeable_workqueue(name) __create_workqueue((name), 0, 1)
+#define create_freezeable_workqueue(name) __create_workqueue((name), 1, 1)
#define create_singlethread_workqueue(name) __create_workqueue((name), 1, 0)

extern void destroy_workqueue(struct workqueue_struct *wq);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 9c8c232..5a75657 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -905,7 +905,7 @@ static void audit_update_watch(struct audit_parent *parent,

/* If the update involves invalidating rules, do the inode-based
* filtering now, so we don't omit records. */
- if (invalidating &&
+ if (invalidating && current->audit_context &&
audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT)
audit_set_auditable(current->audit_context);

diff --git a/kernel/exit.c b/kernel/exit.c
index fec12eb..d306845 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -883,13 +883,29 @@ fastcall NORET_TYPE void do_exit(long code)
if (unlikely(tsk->flags & PF_EXITING)) {
printk(KERN_ALERT
"Fixing recursive fault but reboot is needed!\n");
+ /*
+ * We can do this unlocked here. The futex code uses
+ * this flag just to verify whether the pi state
+ * cleanup has been done or not. In the worst case it
+ * loops once more. We pretend that the cleanup was
+ * done as there is no way to return. Either the
+ * OWNER_DIED bit is set by now or we push the blocked
+ * task into the wait for ever nirwana as well.
+ */
+ tsk->flags |= PF_EXITPIDONE;
if (tsk->io_context)
exit_io_context();
set_current_state(TASK_UNINTERRUPTIBLE);
schedule();
}

+ /*
+ * tsk->flags are checked in the futex code to protect against
+ * an exiting task cleaning up the robust pi futexes.
+ */
+ spin_lock_irq(&tsk->pi_lock);
tsk->flags |= PF_EXITING;
+ spin_unlock_irq(&tsk->pi_lock);

if (unlikely(in_atomic()))
printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
@@ -956,6 +972,12 @@ fastcall NORET_TYPE void do_exit(long code)
* Make sure we are holding no locks:
*/
debug_check_no_locks_held(tsk);
+ /*
+ * We can do this unlocked here. The futex code uses this flag
+ * just to verify whether the pi state cleanup has been done
+ * or not. In the worst case it loops once more.
+ */
+ tsk->flags |= PF_EXITPIDONE;

if (tsk->io_context)
exit_io_context();
diff --git a/kernel/futex.c b/kernel/futex.c
index 1df411e..99dad33 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -390,18 +390,12 @@ static struct task_struct * futex_find_get_task(pid_t pid)

rcu_read_lock();
p = find_task_by_pid(pid);
- if (!p)
- goto out_unlock;
- if ((current->euid != p->euid) && (current->euid != p->uid)) {
- p = NULL;
- goto out_unlock;
- }
- if (p->exit_state != 0) {
- p = NULL;
- goto out_unlock;
- }
- get_task_struct(p);
-out_unlock:
+
+ if (!p || ((current->euid != p->euid) && (current->euid != p->uid)))
+ p = ERR_PTR(-ESRCH);
+ else
+ get_task_struct(p);
+
rcu_read_unlock();

return p;
@@ -467,7 +461,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
struct futex_q *this, *next;
struct list_head *head;
struct task_struct *p;
- pid_t pid;
+ pid_t pid = uval & FUTEX_TID_MASK;

head = &hb->chain;

@@ -485,6 +479,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
return -EINVAL;

WARN_ON(!atomic_read(&pi_state->refcount));
+ WARN_ON(pid && pi_state->owner &&
+ pi_state->owner->pid != pid);

atomic_inc(&pi_state->refcount);
me->pi_state = pi_state;
@@ -495,15 +491,33 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)

/*
* We are the first waiter - try to look up the real owner and attach
- * the new pi_state to it, but bail out when the owner died bit is set
- * and TID = 0:
+ * the new pi_state to it, but bail out when TID = 0
*/
- pid = uval & FUTEX_TID_MASK;
- if (!pid && (uval & FUTEX_OWNER_DIED))
+ if (!pid)
return -ESRCH;
p = futex_find_get_task(pid);
- if (!p)
- return -ESRCH;
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ /*
+ * We need to look at the task state flags to figure out,
+ * whether the task is exiting. To protect against the do_exit
+ * change of the task flags, we do this protected by
+ * p->pi_lock:
+ */
+ spin_lock_irq(&p->pi_lock);
+ if (unlikely(p->flags & PF_EXITING)) {
+ /*
+ * The task is on the way out. When PF_EXITPIDONE is
+ * set, we know that the task has finished the
+ * cleanup:
+ */
+ int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+
+ spin_unlock_irq(&p->pi_lock);
+ put_task_struct(p);
+ return ret;
+ }

pi_state = alloc_pi_state();

@@ -516,7 +530,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
/* Store the key for possible exit cleanups: */
pi_state->key = me->key;

- spin_lock_irq(&p->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &p->pi_state_list);
pi_state->owner = p;
@@ -583,15 +596,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
* preserve the owner died bit.)
*/
if (!(uval & FUTEX_OWNER_DIED)) {
+ int ret = 0;
+
newval = FUTEX_WAITERS | new_owner->pid;

pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
pagefault_enable();
+
if (curval == -EFAULT)
- return -EFAULT;
+ ret = -EFAULT;
if (curval != uval)
- return -EINVAL;
+ ret = -EINVAL;
+ if (ret) {
+ spin_unlock(&pi_state->pi_mutex.wait_lock);
+ return ret;
+ }
}

spin_lock_irq(&pi_state->owner->pi_lock);
@@ -1149,6 +1169,7 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
if (unlikely(ret != 0))
goto out_release_sem;

+ retry_unlocked:
hb = queue_lock(&q, -1, NULL);

retry_locked:
@@ -1200,34 +1221,58 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
ret = lookup_pi_state(uval, hb, &q);

if (unlikely(ret)) {
- /*
- * There were no waiters and the owner task lookup
- * failed. When the OWNER_DIED bit is set, then we
- * know that this is a robust futex and we actually
- * take the lock. This is safe as we are protected by
- * the hash bucket lock. We also set the waiters bit
- * unconditionally here, to simplify glibc handling of
- * multiple tasks racing to acquire the lock and
- * cleanup the problems which were left by the dead
- * owner.
- */
- if (curval & FUTEX_OWNER_DIED) {
- uval = newval;
- newval = current->pid |
- FUTEX_OWNER_DIED | FUTEX_WAITERS;
+ switch (ret) {

- pagefault_disable();
- curval = futex_atomic_cmpxchg_inatomic(uaddr,
- uval, newval);
- pagefault_enable();
+ case -EAGAIN:
+ /*
+ * Task is exiting and we just wait for the
+ * exit to complete.
+ */
+ queue_unlock(&q, hb);
+ up_read(&curr->mm->mmap_sem);
+ cond_resched();
+ goto retry;

- if (unlikely(curval == -EFAULT))
+ case -ESRCH:
+ /*
+ * No owner found for this futex. Check if the
+ * OWNER_DIED bit is set to figure out whether
+ * this is a robust futex or not.
+ */
+ if (get_futex_value_locked(&curval, uaddr))
goto uaddr_faulted;
- if (unlikely(curval != uval))
- goto retry_locked;
- ret = 0;
+
+ /*
+ * There were no waiters and the owner task lookup
+ * failed. When the OWNER_DIED bit is set, then we
+ * know that this is a robust futex and we actually
+ * take the lock. This is safe as we are protected by
+ * the hash bucket lock. We also set the waiters bit
+ * unconditionally here, to simplify glibc handling of
+ * multiple tasks racing to acquire the lock and
+ * cleanup the problems which were left by the dead
+ * owner.
+ */
+ if (curval & FUTEX_OWNER_DIED) {
+ uval = newval;
+ newval = current->pid |
+ FUTEX_OWNER_DIED | FUTEX_WAITERS;
+
+ pagefault_disable();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr,
+ uval,
+ newval);
+ pagefault_enable();
+
+ if (unlikely(curval == -EFAULT))
+ goto uaddr_faulted;
+ if (unlikely(curval != uval))
+ goto retry_locked;
+ ret = 0;
+ }
+ default:
+ goto out_unlock_release_sem;
}
- goto out_unlock_release_sem;
}

/*
@@ -1279,39 +1324,52 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
list_add(&q.pi_state->list, &current->pi_state_list);
spin_unlock_irq(&current->pi_lock);

- /* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
- up_read(&curr->mm->mmap_sem);
/*
* We own it, so we have to replace the pending owner
- * TID. This must be atomic as we have preserve the
+ * TID. This must be atomic as we have to preserve the
* owner died bit here.
*/
- ret = get_user(uval, uaddr);
+ ret = get_futex_value_locked(&uval, uaddr);
while (!ret) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+ pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr,
uval, newval);
+ pagefault_enable();
+
if (curval == -EFAULT)
ret = -EFAULT;
if (curval == uval)
break;
uval = curval;
}
- } else {
+ } else if (ret) {
/*
* Catch the rare case, where the lock was released
* when we were on the way back before we locked
* the hash bucket.
*/
- if (ret && q.pi_state->owner == curr) {
- if (rt_mutex_trylock(&q.pi_state->pi_mutex))
- ret = 0;
+ if (q.pi_state->owner == curr &&
+ rt_mutex_trylock(&q.pi_state->pi_mutex)) {
+ ret = 0;
+ } else {
+ /*
+ * Paranoia check. If we did not take the lock
+ * in the trylock above, then we should not be
+ * the owner of the rtmutex, neither the real
+ * nor the pending one:
+ */
+ if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
+ printk(KERN_ERR "futex_lock_pi: ret = %d "
+ "pi-mutex: %p pi-state %p\n", ret,
+ q.pi_state->pi_mutex.owner,
+ q.pi_state->owner);
}
- /* Unqueue and drop the lock */
- unqueue_me_pi(&q, hb);
- up_read(&curr->mm->mmap_sem);
}
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q, hb);
+ up_read(&curr->mm->mmap_sem);

if (!detect && ret == -EDEADLK && 0)
force_sig(SIGKILL, current);
@@ -1331,16 +1389,18 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
* non-atomically. Therefore, if get_user below is not
* enough, we need to handle the fault ourselves, while
* still holding the mmap_sem.
+ *
+ * ... and hb->lock. :-) --ANK
*/
+ queue_unlock(&q, hb);
+
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt)) {
- ret = -EFAULT;
- goto out_unlock_release_sem;
- }
- goto retry_locked;
+ ret = futex_handle_fault((unsigned long)uaddr, attempt);
+ if (ret)
+ goto out_release_sem;
+ goto retry_unlocked;
}

- queue_unlock(&q, hb);
up_read(&curr->mm->mmap_sem);

ret = get_user(uval, uaddr);
@@ -1382,9 +1442,9 @@ retry:
goto out;

hb = hash_futex(&key);
+retry_unlocked:
spin_lock(&hb->lock);

-retry_locked:
/*
* To avoid races, try to do the TID -> 0 atomic transition
* again. If it succeeds then we can return without waking
@@ -1446,16 +1506,17 @@ pi_faulted:
* non-atomically. Therefore, if get_user below is not
* enough, we need to handle the fault ourselves, while
* still holding the mmap_sem.
+ *
+ * ... and hb->lock. :-) --ANK
*/
+ spin_unlock(&hb->lock);
+
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt)) {
- ret = -EFAULT;
- goto out_unlock;
- }
- goto retry_locked;
+ ret = futex_handle_fault((unsigned long)uaddr, attempt);
+ if (ret)
+ goto out;
+ goto retry_unlocked;
}
-
- spin_unlock(&hb->lock);
up_read(&current->mm->mmap_sem);

ret = get_user(uval, uaddr);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 4ab17da..dd5feae 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -212,6 +212,19 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
if (!waiter || !waiter->task)
goto out_unlock_pi;

+ /*
+ * Check the orig_waiter state. After we dropped the locks,
+ * the previous owner of the lock might have released the lock
+ * and made us the pending owner:
+ */
+ if (orig_waiter && !orig_waiter->task)
+ goto out_unlock_pi;
+
+ /*
+ * Drop out, when the task has no waiters. Note,
+ * top_waiter can be NULL, when we are in the deboosting
+ * mode!
+ */
if (top_waiter && (!task_has_pi_waiters(task) ||
top_waiter != task_top_pi_waiter(task)))
goto out_unlock_pi;
@@ -659,9 +672,16 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
* all over without going into schedule to try
* to get the lock now:
*/
- if (unlikely(!waiter.task))
+ if (unlikely(!waiter.task)) {
+ /*
+ * Reset the return value. We might
+ * have returned with -EDEADLK and the
+ * owner released the lock while we
+ * were walking the pi chain.
+ */
+ ret = 0;
continue;
-
+ }
if (unlikely(ret))
break;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 62db30c..907ab05 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2814,17 +2814,21 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
unsigned long next_balance = jiffies + 60 * HZ;

for_each_domain(this_cpu, sd) {
- if (sd->flags & SD_BALANCE_NEWIDLE) {
+ unsigned long interval;
+
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+
+ if (sd->flags & SD_BALANCE_NEWIDLE)
/* If we've pulled tasks over stop searching: */
pulled_task = load_balance_newidle(this_cpu,
- this_rq, sd);
- if (time_after(next_balance,
- sd->last_balance + sd->balance_interval))
- next_balance = sd->last_balance
- + sd->balance_interval;
- if (pulled_task)
- break;
- }
+ this_rq, sd);
+
+ interval = msecs_to_jiffies(sd->balance_interval);
+ if (time_after(next_balance, sd->last_balance + interval))
+ next_balance = sd->last_balance + interval;
+ if (pulled_task)
+ break;
}
if (!pulled_task)
/*
diff --git a/mm/rmap.c b/mm/rmap.c
index 7ce69c1..c30781c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -53,24 +53,6 @@

struct kmem_cache *anon_vma_cachep;

-static inline void validate_anon_vma(struct vm_area_struct *find_vma)
-{
-#ifdef CONFIG_DEBUG_VM
- struct anon_vma *anon_vma = find_vma->anon_vma;
- struct vm_area_struct *vma;
- unsigned int mapcount = 0;
- int found = 0;
-
- list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
- mapcount++;
- BUG_ON(mapcount > 100000);
- if (vma == find_vma)
- found = 1;
- }
- BUG_ON(!found);
-#endif
-}
-
/* This must be called under the mmap_sem. */
int anon_vma_prepare(struct vm_area_struct *vma)
{
@@ -121,10 +103,8 @@ void __anon_vma_link(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;

- if (anon_vma) {
+ if (anon_vma)
list_add_tail(&vma->anon_vma_node, &anon_vma->head);
- validate_anon_vma(vma);
- }
}

void anon_vma_link(struct vm_area_struct *vma)
@@ -134,7 +114,6 @@ void anon_vma_link(struct vm_area_struct *vma)
if (anon_vma) {
spin_lock(&anon_vma->lock);
list_add_tail(&vma->anon_vma_node, &anon_vma->head);
- validate_anon_vma(vma);
spin_unlock(&anon_vma->lock);
}
}
@@ -148,7 +127,6 @@ void anon_vma_unlink(struct vm_area_struct *vma)
return;

spin_lock(&anon_vma->lock);
- validate_anon_vma(vma);
list_del(&vma->anon_vma_node);

/* We must garbage collect the anon_vma if it's empty */
diff --git a/mm/sparse.c b/mm/sparse.c
index ac26eb0..faa08e2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -209,6 +209,12 @@ static int sparse_init_one_section(struct mem_section *ms,
return 1;
}

+__attribute__((weak))
+void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
+{
+ return NULL;
+}
+
static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
{
struct page *map;
@@ -219,6 +225,11 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
if (map)
return map;

+ map = alloc_bootmem_high_node(NODE_DATA(nid),
+ sizeof(struct page) * PAGES_PER_SECTION);
+ if (map)
+ return map;
+
map = alloc_bootmem_node(NODE_DATA(nid),
sizeof(struct page) * PAGES_PER_SECTION);
if (map)
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/