--- ./drivers/block/Kconfig.drbd	2004-08-14 14:54:51.000000000 +0400
+++ ./drivers/block/Kconfig	2006-02-17 09:23:43.000000000 +0300
@@ -349,4 +349,6 @@ config LBD
 
 source "drivers/s390/block/Kconfig"
 
+source "drivers/block/drbd/Kconfig"
+
 endmenu
--- ./drivers/block/Makefile.drbd	2005-11-07 14:44:50.912255136 +0300
+++ ./drivers/block/Makefile	2005-11-07 14:42:33.712112728 +0300
@@ -35,6 +35,7 @@
 obj-$(CONFIG_BLK_DEV_XD)	+= xd.o
 obj-$(CONFIG_BLK_CPQ_CISS_DA)	+= cciss.o
 obj-$(CONFIG_BLK_DEV_DAC960)	+= DAC960.o
 obj-$(CONFIG_ATA_OVER_ETH)	+= aoe/
+obj-$(CONFIG_BLK_DEV_DRBD)	+= drbd/
 obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
 obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
--- /dev/null	2003-04-26 02:10:32.000000000 +0400
+++ ./drivers/block/drbd/Kconfig	2004-09-21 11:28:38.000000000 +0400
@@ -0,0 +1,34 @@
+#
+# DRBD device driver configuration
+#
+config BLK_DEV_DRBD
+	tristate "DRBD Distributed replicated block device support"
+	select INET
+	select PROC_FS
+	---help---
+	  DRBD is a block device designed for building high-availability
+	  clusters.  This is done by mirroring a whole block device over a
+	  (dedicated) network.  You can think of it as a network RAID 1.
+
+	  Each device (DRBD provides more than one of these devices) has a
+	  state, which can be 'primary' or 'secondary'.  The application is
+	  supposed to run on the node with the primary device and to access
+	  the device (/dev/drbdX).  Every write is sent to the local 'lower
+	  level block device' and over the network to the node with the
+	  device in 'secondary' state.
+	  The secondary device simply writes the data to its lower level
+	  block device.  Reads are always carried out locally.
+
+	  DRBD management is done through user-space tools.
+
+	  Historically, DRBD hijacked the NBD major number (43)
+	  and device nodes (/dev/nbX).
+	  It now has an officially assigned major number (147)
+	  and uses /dev/drbdX.
+
+	  If for some reason you want to keep the old behaviour,
+	  you can pass the "use_nbd_major" module parameter.
+
+	  http://www.drbd.org/
+
+	  If unsure, say N.
--- /dev/null	2003-04-26 02:10:32.000000000 +0400
+++ ./drivers/block/drbd/Makefile	2006-02-17 09:23:43.000000000 +0300
@@ -0,0 +1,7 @@
+CFLAGS_drbd_sizeof_sanity_check.o = # -Wpadded # -Werror
+
+drbd-objs := drbd_sizeof_sanity_check.o \
+	drbd_buildtag.o drbd_bitmap.o drbd_fs.o drbd_proc.o \
+	drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o \
+	lru_cache.o drbd_main.o
+obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
--- /dev/null	2003-04-26 02:10:32.000000000 +0400
+++ ./drivers/block/drbd/drbd_actlog.c	2006-01-26 14:56:50.000000000 +0300
@@ -0,0 +1,964 @@
+/*
+-*- linux-c -*-
+   drbd_actlog.c
+   Kernel module for 2.4.x/2.6.x Kernels
+
+   This file is part of drbd by Philipp Reisner.
+
+   Copyright (C) 2003-2004, Philipp Reisner .
+   Copyright (C) 2003-2004, Lars Ellenberg .
+	authors.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING. If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ + */ + +#include +#include +#include "drbd_int.h" + +/* This is what I like so much about the linux kernel: + * if you have a close look, you can almost always reuse code by someone else + * ;) + * this is mostly from drivers/md/md.c + */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +STATIC int _drbd_md_sync_page_io(drbd_dev *mdev, struct page *page, + sector_t sector, int rw, int size) +{ + struct buffer_head bh; + struct completion event; + int ok; + + init_completion(&event); + init_buffer(&bh, drbd_md_io_complete, &event); + bh.b_rdev = mdev->md_bdev; + bh.b_rsector = sector; + bh.b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock); + bh.b_size = size; + bh.b_page = page; + bh.b_reqnext = NULL; + bh.b_data = page_address(page); + generic_make_request(rw, &bh); + + run_task_queue(&tq_disk); + wait_for_completion(&event); + + ok = test_bit(BH_Uptodate, &bh.b_state); + + return ok; +} +#else +STATIC int _drbd_md_sync_page_io(drbd_dev *mdev, struct page *page, + sector_t sector, int rw, int size) +{ + struct bio *bio = bio_alloc(GFP_KERNEL, 1); + struct completion event; + int ok; + + bio->bi_bdev = mdev->md_bdev; + bio->bi_sector = sector; + bio_add_page(bio, page, size, 0); + init_completion(&event); + bio->bi_private = &event; + bio->bi_end_io = drbd_md_io_complete; + +#ifdef BIO_RW_SYNC + submit_bio(rw | (1 << BIO_RW_SYNC), bio); +#else + submit_bio(rw, bio); + drbd_blk_run_queue(bdev_get_queue(mdev->md_bdev)); +#endif + wait_for_completion(&event); + + ok = test_bit(BIO_UPTODATE, &bio->bi_flags); + bio_put(bio); + return ok; +} +#endif + +int drbd_md_sync_page_io(drbd_dev *mdev, sector_t sector, int rw) +{ + int hardsect,mask,ok,offset=0; + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); + struct page *iop = mdev->md_io_page; + + D_ASSERT(semaphore_is_locked(&mdev->md_io_mutex)); + + if (!mdev->md_bdev) { + if (test_bit(DISKLESS,&mdev->flags)) return 0; + if (DRBD_ratelimit(5*HZ,5)) { + ERR("mdev->md_bdev==NULL\n"); + dump_stack(); + } + return 0; + } + + + hardsect = drbd_get_hardsect(mdev->md_bdev); + + // in case hardsect != 512 [ s390 only? ] + if( hardsect != MD_HARDSECT ) { + if(!mdev->md_io_tmpp) { + struct page *page = alloc_page(GFP_KERNEL); + if(!page) return 0; + + WARN("Meta data's bdev hardsect_size != %d\n", + MD_HARDSECT); + WARN("Workaround engaged (has performace impact).\n"); + + mdev->md_io_tmpp = page; + } + + mask = ( hardsect / MD_HARDSECT ) - 1; + D_ASSERT( mask == 1 || mask == 3 || mask == 7 ); + D_ASSERT( hardsect == (mask+1) * MD_HARDSECT ); + offset = sector & mask; + sector = sector & ~mask; + iop = mdev->md_io_tmpp; + + if (rw == WRITE) { + void *p = page_address(mdev->md_io_page); + void *hp = page_address(mdev->md_io_tmpp); + + ok = _drbd_md_sync_page_io(mdev,iop, + sector,READ,hardsect); + + if (unlikely(!ok)) return 0; + + memcpy(hp + offset*MD_HARDSECT , p, MD_HARDSECT); + } + } + +#if DUMP_MD >= 3 + INFO("%s [%d]:%s(,%ld,%s)\n", + current->comm, current->pid, __func__, + sector, rw ? "WRITE" : "READ"); +#endif + + if (sector < drbd_md_ss(mdev) || + sector > drbd_md_ss(mdev)+MD_BM_OFFSET+BM_SECT_TO_EXT(capacity)) { + ALERT("%s [%d]:%s(,%llu,%s) out of range md access!\n", + current->comm, current->pid, __func__, + (unsigned long long)sector, rw ? "WRITE" : "READ"); + } + + ok = _drbd_md_sync_page_io(mdev,iop,sector,rw,hardsect); + if (unlikely(!ok)) { + ERR("drbd_md_sync_page_io(,%llu,%s) failed!\n", + (unsigned long long)sector,rw ? 
"WRITE" : "READ"); + } + + if( hardsect != MD_HARDSECT && rw == READ ) { + void *p = page_address(mdev->md_io_page); + void *hp = page_address(mdev->md_io_tmpp); + + memcpy(p, hp + offset*MD_HARDSECT, MD_HARDSECT); + } + + return ok; +} + + +struct __attribute__((packed)) al_transaction { + u32 magic; + u32 tr_number; + // u32 tr_generation; //TODO + struct __attribute__((packed)) { + u32 pos; + u32 extent; } updates[1 + AL_EXTENTS_PT]; + u32 xor_sum; + // I do not believe that all storage medias can guarantee atomic + // 512 byte write operations. When the journal is read, only + // transactions with correct xor_sums are considered. +}; // sizeof() = 512 byte + + +struct update_odbm_work { + struct drbd_work w; + unsigned int enr; +}; + +struct update_al_work { + struct drbd_work w; + struct lc_element * al_ext; + struct completion event; + unsigned int enr; +}; + +STATIC int w_al_write_transaction(struct Drbd_Conf *, struct drbd_work *, int); + +static inline +struct lc_element* _al_get(struct Drbd_Conf *mdev, unsigned int enr) +{ + struct lc_element *al_ext; + struct bm_extent *bm_ext; + unsigned long al_flags=0; + + spin_lock_irq(&mdev->al_lock); + bm_ext = (struct bm_extent*) lc_find(mdev->resync,enr/AL_EXT_PER_BM_SECT); + if (unlikely(bm_ext!=NULL)) { + if(test_bit(BME_NO_WRITES,&bm_ext->flags)) { + spin_unlock_irq(&mdev->al_lock); + //INFO("Delaying app write until sync read is done\n"); + return 0; + } + } + al_ext = lc_get(mdev->act_log,enr); + al_flags = mdev->act_log->flags; + spin_unlock_irq(&mdev->al_lock); + + /* + if (!al_ext) { + if (al_flags & LC_STARVING) + WARN("Have to wait for LRU element (AL too small?)\n"); + if (al_flags & LC_DIRTY) + WARN("Ongoing AL update (AL device too slow?)\n"); + } + */ + + return al_ext; +} + +void drbd_al_begin_io(struct Drbd_Conf *mdev, sector_t sector) +{ + unsigned int enr = (sector >> (AL_EXTENT_SIZE_B-9)); + struct lc_element *al_ext; + struct update_al_work al_work; + + D_ASSERT(atomic_read(&mdev->local_cnt)>0); + wait_event(mdev->al_wait, (al_ext = _al_get(mdev,enr)) ); + + if (al_ext->lc_number != enr) { + // We have to do write an transaction to AL. + unsigned int evicted; + + evicted = al_ext->lc_number; + + if(mdev->cstate < Connected && evicted != LC_FREE ) { + drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT ); + } + + /* drbd_al_write_transaction(mdev,al_ext,enr); + generic_make_request() are serialized on the + current->bio_tail list now. Therefore we have + to deligate writing something to AL to the + worker thread. 
*/ + init_completion(&al_work.event); + al_work.al_ext = al_ext; + al_work.enr = enr; + al_work.w.cb = w_al_write_transaction; + drbd_queue_work_front(mdev,&mdev->data.work,&al_work.w); + wait_for_completion(&al_work.event); + + mdev->al_writ_cnt++; + + /* + DUMPI(al_ext->lc_number); + DUMPI(mdev->act_log->new_number); + */ + spin_lock_irq(&mdev->al_lock); + lc_changed(mdev->act_log,al_ext); + spin_unlock_irq(&mdev->al_lock); + wake_up(&mdev->al_wait); + } +} + +void drbd_al_complete_io(struct Drbd_Conf *mdev, sector_t sector) +{ + unsigned int enr = (sector >> (AL_EXTENT_SIZE_B-9)); + struct lc_element *extent; + unsigned long flags; + + spin_lock_irqsave(&mdev->al_lock,flags); + + extent = lc_find(mdev->act_log,enr); + + if(!extent) { + spin_unlock_irqrestore(&mdev->al_lock,flags); + ERR("al_complete_io() called on inactive extent %u\n",enr); + return; + } + + if( lc_put(mdev->act_log,extent) == 0 ) { + wake_up(&mdev->al_wait); + } + + spin_unlock_irqrestore(&mdev->al_lock,flags); +} + +STATIC int +w_al_write_transaction(struct Drbd_Conf *mdev, struct drbd_work *w, int unused) +{ + int i,n,mx; + unsigned int extent_nr; + struct al_transaction* buffer; + sector_t sector; + u32 xor_sum=0; + + struct lc_element *updated = ((struct update_al_work*)w)->al_ext; + unsigned int new_enr = ((struct update_al_work*)w)->enr; + + down(&mdev->md_io_mutex); // protects md_io_buffer, al_tr_cycle, ... + buffer = (struct al_transaction*)page_address(mdev->md_io_page); + + buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); + buffer->tr_number = cpu_to_be32(mdev->al_tr_number); + + n = lc_index_of(mdev->act_log, updated); + + buffer->updates[0].pos = cpu_to_be32(n); + buffer->updates[0].extent = cpu_to_be32(new_enr); + +#if 0 /* Use this printf with the test_al.pl program */ + ERR("T%03d S%03d=E%06d\n", mdev->al_tr_number,n,new_enr); +#endif + + xor_sum ^= new_enr; + + mx = min_t(int,AL_EXTENTS_PT, + mdev->act_log->nr_elements - mdev->al_tr_cycle); + for(i=0;iact_log, + mdev->al_tr_cycle+i)->lc_number; + buffer->updates[i+1].pos = cpu_to_be32(mdev->al_tr_cycle+i); + buffer->updates[i+1].extent = cpu_to_be32(extent_nr); + xor_sum ^= extent_nr; + } + for(;iupdates[i+1].pos = __constant_cpu_to_be32(-1); + buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE); + xor_sum ^= LC_FREE; + } + mdev->al_tr_cycle += AL_EXTENTS_PT; + if(mdev->al_tr_cycle >= mdev->act_log->nr_elements) mdev->al_tr_cycle=0; + + buffer->xor_sum = cpu_to_be32(xor_sum); + + + sector = drbd_md_ss(mdev) + MD_AL_OFFSET + mdev->al_tr_pos ; + + if(!drbd_md_sync_page_io(mdev,sector,WRITE)) { + drbd_chk_io_error(mdev, 1); + drbd_io_error(mdev); + } + + if( ++mdev->al_tr_pos > div_ceil(mdev->act_log->nr_elements,AL_EXTENTS_PT) ) { + mdev->al_tr_pos=0; + } + D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); + mdev->al_tr_number++; + + up(&mdev->md_io_mutex); + + complete(&((struct update_al_work*)w)->event); + + return 1; +} + +STATIC int drbd_al_read_tr(struct Drbd_Conf *mdev, + struct al_transaction* b, + int index) +{ + sector_t sector; + int rv,i; + u32 xor_sum=0; + + sector = drbd_md_ss(mdev) + MD_AL_OFFSET + index; + + if(!drbd_md_sync_page_io(mdev,sector,READ)) { + drbd_chk_io_error(mdev, 1); + drbd_io_error(mdev); + return 0; + } + + rv = ( be32_to_cpu(b->magic) == DRBD_MAGIC ); + + for(i=0;iupdates[i].extent); + } + rv &= (xor_sum == be32_to_cpu(b->xor_sum)); + + return rv; +} + +void drbd_al_read_log(struct Drbd_Conf *mdev) +{ + struct al_transaction* buffer; + int from=-1,to=-1,i,cnr, overflow=0,rv; + u32 from_tnr=-1, to_tnr=0; + int 
active_extents=0; + int transactions=0; + int mx; + + mx = div_ceil(mdev->act_log->nr_elements,AL_EXTENTS_PT); + + /* lock out all other meta data io for now, + * and make sure the page is mapped. + */ + down(&mdev->md_io_mutex); + buffer = page_address(mdev->md_io_page); + + // Find the valid transaction in the log + for(i=0;i<=mx;i++) { + if(!drbd_al_read_tr(mdev,buffer,i)) continue; + cnr = be32_to_cpu(buffer->tr_number); + // INFO("index %d valid tnr=%d\n",i,cnr); + + if(cnr == -1) overflow=1; + + if(cnr < from_tnr && !overflow) { + from = i; + from_tnr = cnr; + } + if(cnr > to_tnr) { + to = i; + to_tnr = cnr; + } + } + + if(from == -1 || to == -1) { + WARN("No usable activity log found.\n"); + + up(&mdev->md_io_mutex); + return; + } + + // Read the valid transactions. + // INFO("Reading from %d to %d.\n",from,to); + + /* this should better be handled by a for loop, no? + */ + i=from; + while(1) { + int j,pos; + unsigned int extent_nr; + unsigned int trn; + + rv = drbd_al_read_tr(mdev,buffer,i); + ERR_IF(!rv) goto cancel; + + trn=be32_to_cpu(buffer->tr_number); + + spin_lock_irq(&mdev->al_lock); + for(j=0;jupdates[j].pos); + extent_nr = be32_to_cpu(buffer->updates[j].extent); + + if(extent_nr == LC_FREE) continue; + + //if(j<3) INFO("T%03d S%03d=E%06d\n",trn,pos,extent_nr); + lc_set(mdev->act_log,extent_nr,pos); + active_extents++; + } + spin_unlock_irq(&mdev->al_lock); + + transactions++; + + cancel: + if( i == to) break; + i++; + if( i > mx ) i=0; + } + + mdev->al_tr_number = to_tnr+1; + mdev->al_tr_pos = to; + if( ++mdev->al_tr_pos > div_ceil(mdev->act_log->nr_elements,AL_EXTENTS_PT) ) { + mdev->al_tr_pos=0; + } + + /* ok, we are done with it */ + up(&mdev->md_io_mutex); + + INFO("Found %d transactions (%d active extents) in activity log.\n", + transactions,active_extents); +} + +/** + * drbd_al_to_on_disk_bm: + * Writes the areas of the bitmap which are covered by the AL. + * called when we detach (unconfigure) local storage, + * or when we go from Primary to Secondary state. + */ +void drbd_al_to_on_disk_bm(struct Drbd_Conf *mdev) +{ + int i; + unsigned int enr; + + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + + i=inc_local_md_only(mdev); + D_ASSERT( i ); // Assertions should not have side effects. + // I do not want to have D_ASSERT( inc_local_md_only(mdev) ); + + for(i=0;iact_log->nr_elements;i++) { + enr = lc_entry(mdev->act_log,i)->lc_number; + if(enr == LC_FREE) continue; + /* TODO encapsulate and optimize within drbd_bitmap + * currently, if we have al-extents 16..19 active, + * sector 4 will be written four times! */ + drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT ); + } + + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + dec_local(mdev); +} + +/** + * drbd_al_apply_to_bm: Sets the bits in the bitmap that are described + * by the active extents of the AL. 
+ */ +void drbd_al_apply_to_bm(struct Drbd_Conf *mdev) +{ + unsigned int enr; + unsigned long add=0; + char ppb[10]; + int i; + + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + + for(i=0;iact_log->nr_elements;i++) { + enr = lc_entry(mdev->act_log,i)->lc_number; + if(enr == LC_FREE) continue; + add += drbd_bm_ALe_set_all(mdev, enr); + } + + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + + INFO("Marked additional %s as out-of-sync based on AL.\n", + ppsize(ppb,add >> 1)); +} + +static inline int _try_lc_del(struct Drbd_Conf *mdev,struct lc_element *al_ext) +{ + int rv; + + spin_lock_irq(&mdev->al_lock); + rv = (al_ext->refcnt == 0); + if(likely(rv)) lc_del(mdev->act_log,al_ext); + spin_unlock_irq(&mdev->al_lock); + + if(unlikely(!rv)) INFO("Waiting for extent in drbd_al_shrink()\n"); + + return rv; +} + +/** + * drbd_al_shrink: Removes all active extents form the AL. (but does not + * write any transactions) + * You need to lock mdev->act_log with lc_try_lock() / lc_unlock() + */ +void drbd_al_shrink(struct Drbd_Conf *mdev) +{ + struct lc_element *al_ext; + int i; + + D_ASSERT( test_bit(__LC_DIRTY,&mdev->act_log->flags) ); + + for(i=0;iact_log->nr_elements;i++) { + al_ext = lc_entry(mdev->act_log,i); + if(al_ext->lc_number == LC_FREE) continue; + wait_event(mdev->al_wait, _try_lc_del(mdev,al_ext)); + } + + wake_up(&mdev->al_wait); +} + +STATIC int w_update_odbm(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + struct update_odbm_work *udw = (struct update_odbm_work*)w; + + if( !inc_local_md_only(mdev) ) { + if (DRBD_ratelimit(5*HZ,5)) + WARN("Can not update on disk bitmap, local IO disabled.\n"); + return 1; + } + + drbd_bm_write_sect(mdev, udw->enr ); + dec_local(mdev); + + kfree(udw); + + if(drbd_bm_total_weight(mdev) == 0 && + ( mdev->cstate == SyncSource || mdev->cstate == SyncTarget || + mdev->cstate == PausedSyncS || mdev->cstate == PausedSyncT ) ) { + D_ASSERT( mdev->resync_work.cb == w_resync_inactive ); + drbd_bm_lock(mdev); + drbd_resync_finished(mdev); + drbd_bm_unlock(mdev); + } + + return 1; +} + + +/* ATTENTION. The AL's extents are 4MB each, while the extents in the * + * resync LRU-cache are 16MB each. * + * + * TODO will be obsoleted once we have a caching lru of the on disk bitmap + */ +STATIC void drbd_try_clear_on_disk_bm(struct Drbd_Conf *mdev,sector_t sector, + int cleared) +{ + struct list_head *le, *tmp; + struct bm_extent* ext; + struct update_odbm_work * udw; + + unsigned int enr; + + MUST_HOLD(&mdev->al_lock); + + // I simply assume that a sector/size pair never crosses + // a 16 MB extent border. (Currently this is true...) + enr = BM_SECT_TO_EXT(sector); + + ext = (struct bm_extent *) lc_get(mdev->resync,enr); + if (ext) { + if( ext->lce.lc_number == enr) { + ext->rs_left -= cleared; + if (ext->rs_left < 0) { + ERR("BAD! sector=%lu enr=%u rs_left=%d cleared=%d\n", + (unsigned long)sector, + ext->lce.lc_number, ext->rs_left, cleared); + // FIXME brrrgs. should never happen! + _set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return; + } + } else { + //WARN("Recounting sectors in %d (resync LRU too small?)\n", enr); + // This element should be in the cache + // since drbd_rs_begin_io() pulled it already in. 
+ int rs_left = drbd_bm_e_weight(mdev,enr); + if (ext->flags != 0) { + WARN("changing resync lce: %d[%u;%02lx]" + " -> %d[%u;00]\n", + ext->lce.lc_number, ext->rs_left, + ext->flags, enr, rs_left); + ext->flags = 0; + } + ext->rs_left = rs_left; + lc_changed(mdev->resync,&ext->lce); + } + lc_put(mdev->resync,&ext->lce); + // no race, we are within the al_lock! + } else { + ERR("lc_get() failed! locked=%d/%d flags=%lu\n", + atomic_read(&mdev->resync_locked), + mdev->resync->nr_elements, + mdev->resync->flags); + } + + list_for_each_safe(le,tmp,&mdev->resync->lru) { + ext=(struct bm_extent *)list_entry(le,struct lc_element,list); + if(ext->rs_left == 0) { + udw=kmalloc(sizeof(*udw),GFP_ATOMIC); + if(!udw) { + WARN("Could not kmalloc an udw\n"); + break; + } + udw->enr = ext->lce.lc_number; + udw->w.cb = w_update_odbm; + drbd_queue_work_front(mdev,&mdev->data.work,&udw->w); + if (ext->flags != 0) { + WARN("deleting resync lce: %d[%u;%02lx]\n", + ext->lce.lc_number, ext->rs_left, + ext->flags); + ext->flags = 0; + } + lc_del(mdev->resync,&ext->lce); + } + } +} + +/* clear the bit corresponding to the piece of storage in question: + * size byte of data starting from sector. Only clear a bits of the affected + * one ore more _aligned_ BM_BLOCK_SIZE blocks. + * + * called by worker on SyncTarget and receiver on SyncSource. + * + */ +void __drbd_set_in_sync(drbd_dev* mdev, sector_t sector, int size, const char* file, const unsigned int line) +{ + /* Is called from worker and receiver context _only_ */ + unsigned long sbnr,ebnr,lbnr,bnr; + unsigned long count = 0; + sector_t esector, nr_sectors; + int strange_state,wake_up=0; + + strange_state = (mdev->cstate <= Connected) || + test_bit(DISKLESS,&mdev->flags) || + test_bit(PARTNER_DISKLESS,&mdev->flags); + if (strange_state) { + ERR("%s:%d: %s flags=0x%02lx\n", file , line , + cstate_to_name(mdev->cstate), mdev->flags); + } + + if (size <= 0 || (size & 0x1ff) != 0 || size > PAGE_SIZE) { + ERR("drbd_set_in_sync: sector=%lu size=%d nonsense!\n", + (unsigned long)sector,size); + return; + } + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size>>9) -1; + + ERR_IF(sector >= nr_sectors) return; + ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + /* we clear it (in sync). + * round up start sector, round down end sector. we make sure we only + * clear full, alligned, BM_BLOCK_SIZE (4K) blocks */ + if (unlikely(esector < BM_SECT_PER_BIT-1)) { + return; + } else if (unlikely(esector == (nr_sectors-1))) { + ebnr = lbnr; + } else { + ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); + } + sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); + +#ifdef DUMP_EACH_PACKET + INFO("drbd_set_in_sync: sector=%lu size=%d sbnr=%lu ebnr=%lu\n", + (unsigned long)sector, size, sbnr, ebnr); +#endif + + if (sbnr > ebnr) return; + + /* + * ok, (capacity & 7) != 0 sometimes, but who cares... + * we count rs_{total,left} in bits, not sectors. + */ + spin_lock_irq(&mdev->al_lock); + for(bnr=sbnr; bnr <= ebnr; bnr++) { + if (drbd_bm_clear_bit(mdev,bnr)) count++; + } + if (count) { + // we need the lock for drbd_try_clear_on_disk_bm + if(jiffies - mdev->rs_mark_time > HZ*10) { + /* should be roling marks, but we estimate only anyways. 
*/ + if( mdev->rs_mark_left != drbd_bm_total_weight(mdev)) { + mdev->rs_mark_time =jiffies; + mdev->rs_mark_left =drbd_bm_total_weight(mdev); + } + } + drbd_try_clear_on_disk_bm(mdev,sector,count); + /* just wake_up unconditional now, + * various lc_chaged(), lc_put() in drbd_try_clear_on_disk_bm(). */ + wake_up=1; + } + spin_unlock_irq(&mdev->al_lock); + if(wake_up) wake_up(&mdev->al_wait); +} + +/* + * this is intended to set one request worth of data out of sync. + * affects at least 1 bit, and at most 1+PAGE_SIZE/BM_BLOCK_SIZE bits. + * + * called by tl_clear and drbd_send_dblock (==drbd_make_request). + * so this can be _any_ process. + */ +void __drbd_set_out_of_sync(drbd_dev* mdev, sector_t sector, int size, const char* file, const unsigned int line) +{ + unsigned long sbnr,ebnr,lbnr,bnr; + sector_t esector, nr_sectors; + int strange_state; + + strange_state = ( mdev->cstate > Connected ) || + ( mdev->cstate == Connected && + !(test_bit(DISKLESS,&mdev->flags) || + test_bit(PARTNER_DISKLESS,&mdev->flags)) ); + if (strange_state) { + ERR("%s:%d: %s flags=0x%02lx\n", file , line , + cstate_to_name(mdev->cstate), mdev->flags); + } + + if (size <= 0 || (size & 0x1ff) != 0 || size > PAGE_SIZE) { + ERR("sector: %lu, size: %d\n",(unsigned long)sector,size); + return; + } + + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size>>9) -1; + + ERR_IF(sector >= nr_sectors) return; + ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + /* we set it out of sync, + * we do not need to round anything here */ + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + + /* + * ok, (capacity & 7) != 0 sometimes, but who cares... + * we count rs_{total,left} in bits, not sectors. + */ + for(bnr=sbnr; bnr <= ebnr; bnr++) drbd_bm_set_bit(mdev,bnr); +} + +static inline +struct bm_extent* _bme_get(struct Drbd_Conf *mdev, unsigned int enr) +{ + struct bm_extent *bm_ext; + int wakeup = 0; + unsigned long rs_flags; + + if(atomic_read(&mdev->resync_locked) > mdev->resync->nr_elements-3 ) { + //WARN("bme_get() does not lock all elements\n"); + return 0; + } + + spin_lock_irq(&mdev->al_lock); + bm_ext = (struct bm_extent*) lc_get(mdev->resync,enr); + if (bm_ext) { + if(bm_ext->lce.lc_number != enr) { + bm_ext->rs_left = drbd_bm_e_weight(mdev,enr); + lc_changed(mdev->resync,(struct lc_element*)bm_ext); + wakeup = 1; + } + if(bm_ext->lce.refcnt == 1) atomic_inc(&mdev->resync_locked); + set_bit(BME_NO_WRITES,&bm_ext->flags); // within the lock + } + rs_flags=mdev->resync->flags; + spin_unlock_irq(&mdev->al_lock); + if (wakeup) wake_up(&mdev->al_wait); + + if(!bm_ext) { + if (rs_flags & LC_STARVING) { + WARN("Have to wait for element" + " (resync LRU too small?)\n"); + } + if (rs_flags & LC_DIRTY) { + BUG(); // WARN("Ongoing RS update (???)\n"); + } + } + + return bm_ext; +} + +static inline int _is_in_al(drbd_dev* mdev, unsigned int enr) +{ + struct lc_element* al_ext; + int rv=0; + + spin_lock_irq(&mdev->al_lock); + if(unlikely(enr == mdev->act_log->new_number)) rv=1; + else { + al_ext = lc_find(mdev->act_log,enr); + if(al_ext) { + if (al_ext->refcnt) rv=1; + } + } + spin_unlock_irq(&mdev->al_lock); + + /* + if(unlikely(rv)) { + INFO("Delaying sync read until app's write is done\n"); + } + */ + return rv; +} + +/** + * drbd_rs_begin_io: Gets an extent in the resync LRU cache and sets it + * to BME_LOCKED. 
+ * + * @sector: The sector number + */ +int drbd_rs_begin_io(drbd_dev* mdev, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + struct bm_extent* bm_ext; + int i, sig; + + sig = wait_event_interruptible( mdev->al_wait, + (bm_ext = _bme_get(mdev,enr)) ); + if (sig) return 0; + + if(test_bit(BME_LOCKED,&bm_ext->flags)) return 1; + + for(i=0;ial_wait, + !_is_in_al(mdev,enr*AL_EXT_PER_BM_SECT+i) ); + if (sig) { + spin_lock_irq(&mdev->al_lock); + if( lc_put(mdev->resync,&bm_ext->lce) == 0 ) { + clear_bit(BME_NO_WRITES,&bm_ext->flags); + atomic_dec(&mdev->resync_locked); + wake_up(&mdev->al_wait); + } + spin_unlock_irq(&mdev->al_lock); + return 0; + } + } + + set_bit(BME_LOCKED,&bm_ext->flags); + + return 1; +} + +void drbd_rs_complete_io(drbd_dev* mdev, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + struct bm_extent* bm_ext; + unsigned long flags; + + spin_lock_irqsave(&mdev->al_lock,flags); + bm_ext = (struct bm_extent*) lc_find(mdev->resync,enr); + if(!bm_ext) { + spin_unlock_irqrestore(&mdev->al_lock,flags); + ERR("drbd_rs_complete_io() called, but extent not found\n"); + return; + } + + if( lc_put(mdev->resync,(struct lc_element *)bm_ext) == 0 ) { + clear_bit(BME_LOCKED,&bm_ext->flags); + clear_bit(BME_NO_WRITES,&bm_ext->flags); + atomic_dec(&mdev->resync_locked); + wake_up(&mdev->al_wait); + } + + spin_unlock_irqrestore(&mdev->al_lock,flags); +} + +/** + * drbd_rs_cancel_all: Removes extents from the resync LRU. Even + * if they are BME_LOCKED. + */ +void drbd_rs_cancel_all(drbd_dev* mdev) +{ + struct bm_extent* bm_ext; + int i; + + spin_lock_irq(&mdev->al_lock); + + for(i=0;iresync->nr_elements;i++) { + bm_ext = (struct bm_extent*) lc_entry(mdev->resync,i); + if(bm_ext->lce.lc_number == LC_FREE) continue; + bm_ext->lce.refcnt = 0; // Rude but ok. + bm_ext->rs_left = 0; + clear_bit(BME_LOCKED,&bm_ext->flags); + clear_bit(BME_NO_WRITES,&bm_ext->flags); + lc_del(mdev->resync,&bm_ext->lce); + } + atomic_set(&mdev->resync_locked,0); + spin_unlock_irq(&mdev->al_lock); + wake_up(&mdev->al_wait); +} --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./drivers/block/drbd/drbd_bitmap.c 2006-02-10 15:23:38.000000000 +0300 @@ -0,0 +1,980 @@ +/* +-*- linux-c -*- + drbd_bitmap.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 2004, Lars Ellenberg . + main author. + + Copyright (C) 2004, Philipp Reisner . + contributions. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */ + +#include +#include +#include // for memset + +#include +#include "drbd_int.h" + +/* special handling for ppc64 on 2.4 kernel -- find_next_bit is not exported + * so we include it here (verbatim, from linux 2.4.21 sources) */ +#if defined(__powerpc64__) && LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + +unsigned long find_next_bit(unsigned long *addr, unsigned long size, unsigned long offset) +{ + unsigned long *p = addr + (offset >> 6); + unsigned long result = offset & ~63UL; + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset &= 63UL; + if (offset) { + tmp = *(p++); + tmp &= (~0UL << offset); + if (size < 64) + goto found_first; + if (tmp) + goto found_middle; + size -= 64; + result += 64; + } + while (size & ~63UL) { + if ((tmp = *(p++))) + goto found_middle; + result += 64; + size -= 64; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp &= (~0UL >> (64 - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __ffs(tmp); +} +#endif /* NEED_PPC64_WORKAROUND */ + +/* OPAQUE outside this file! + * interface defined in drbd_int.h + * + * unfortunately this currently means that this file is not + * yet selfcontained, because it needs to know about how to receive + * the bitmap from the peer via the data socket. + * This is to be solved with some sort of + * drbd_bm_copy(mdev,offset,size,unsigned long*) ... + + * Note that since find_first_bit returns int, this implementation + * "only" supports up to 1<<(32+12) == 16 TB... non issue, since + * currently DRBD is limited to ca 3.8 TB storage anyways. + * + * we will eventually change the implementation to not allways hold the full + * bitmap in memory, but only some 'lru_cache' of the on disk bitmap, + * since vmalloc'ing mostly unused 128M is antisocial. + + * THINK + * I'm not yet sure whether this file should be bits only, + * or wether I want it to do all the sector<->bit calculation in here. + */ + +/* + * NOTE + * Access to the *bm is protected by bm_lock. + * It is safe to read the other members within the lock. + * + * drbd_bm_set_bit is called from bio_endio callbacks, + * so there we need a spin_lock_irqsave. + * Everywhere else we need a spin_lock_irq. + * + * FIXME + * Actually you need to serialize all resize operations. + * but then, resize is a drbd state change, and it should be serialized + * already. Unfortunately it is not (yet), so two concurrent resizes, like + * attach storage (drbdsetup) and receive the peers size (drbd receiver) + * may eventually blow things up. + * Therefore, + * you may only change the other members when holding + * the bm_change mutex _and_ the bm_lock. + * thus reading them holding either is safe. + * this is sort of overkill, but I rather do it right + * than have two resize operations interfere somewhen. + */ +struct drbd_bitmap { + unsigned long *bm; + spinlock_t bm_lock; + unsigned long bm_fo; // next offset for drbd_bm_find_next + unsigned long bm_set; // nr of set bits; THINK maybe atomic_t ? + unsigned long bm_bits; + size_t bm_words; + sector_t bm_dev_capacity; + struct semaphore bm_change; // serializes resize operations + + // { REMOVE + unsigned long bm_flags; // currently debugging aid only + unsigned long bm_line; + char *bm_file; + // } +}; + +// { REMOVE once we serialize all state changes properly +#define D_BUG_ON(x) ERR_IF(x) { dump_stack(); } +#define BM_LOCKED 0 +#if 0 // simply disabled for now... 
+#define MUST_NOT_BE_LOCKED() do { \ + if (test_bit(BM_LOCKED,&b->bm_flags)) { \ + if (DRBD_ratelimit(5*HZ,5)) { \ + ERR("%s:%d: bitmap is locked by %s:%lu\n", \ + __FILE__, __LINE__, b->bm_file,b->bm_line); \ + dump_stack(); \ + } \ + } \ +} while (0) +#define MUST_BE_LOCKED() do { \ + if (!test_bit(BM_LOCKED,&b->bm_flags)) { \ + if (DRBD_ratelimit(5*HZ,5)) { \ + ERR("%s:%d: bitmap not locked!\n", \ + __FILE__, __LINE__); \ + dump_stack(); \ + } \ + } \ +} while (0) +#else +#define MUST_NOT_BE_LOCKED() do {(void)b;} while (0) +#define MUST_BE_LOCKED() do {(void)b;} while (0) +#endif +void __drbd_bm_lock(drbd_dev *mdev, char* file, int line) +{ + struct drbd_bitmap *b = mdev->bitmap; + spin_lock_irq(&b->bm_lock); + if (!__test_and_set_bit(BM_LOCKED,&b->bm_flags)) { + b->bm_file = file; + b->bm_line = line; + } else if (DRBD_ratelimit(5*HZ,5)) { + ERR("%s:%d: bitmap already locked by %s:%lu\n", + file, line, b->bm_file,b->bm_line); + /* + dump_stack(); + ERR("This is no oops, but debug stack trace only.\n"); + ERR("If you get this often, or in reproducable situations, " + "notify \n"); + */ + } + spin_unlock_irq(&b->bm_lock); +} +void drbd_bm_unlock(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + spin_lock_irq(&b->bm_lock); + if (!__test_and_clear_bit(BM_LOCKED,&mdev->bitmap->bm_flags)) { + ERR("bitmap not locked in bm_unlock\n"); + } else { + /* FIXME if we got a "is already locked" previously, + * we unlock here even though we actually MUST NOT do so... */ + b->bm_file = NULL; + b->bm_line = -1; + } + spin_unlock_irq(&b->bm_lock); +} + +#if 0 +// has been very helpful to indicate that rs_total and rs_left have been +// used in a non-smp safe way... +#define BM_PARANOIA_CHECK() do { \ + D_ASSERT(b->bm[b->bm_words] == DRBD_MAGIC); \ + D_ASSERT(b->bm_dev_capacity == drbd_get_capacity(mdev->this_bdev)); \ + if ( (b->bm_set != mdev->rs_total) && \ + (b->bm_set != mdev->rs_left) ) { \ + if ( DRBD_ratelimit(5*HZ,5) ) { \ + ERR("%s:%d: ?? bm_set=%lu; rs_total=%lu, rs_left=%lu\n",\ + __FILE__ , __LINE__ , \ + b->bm_set, mdev->rs_total, mdev->rs_left ); \ + } \ + } \ +} while (0) +#else +#define BM_PARANOIA_CHECK() do { \ + D_ASSERT(b->bm[b->bm_words] == DRBD_MAGIC); \ + D_ASSERT(b->bm_dev_capacity == drbd_get_capacity(mdev->this_bdev)); \ +} while (0) +#endif +// } + +#if DUMP_MD >= 3 +/* debugging aid */ +STATIC void bm_end_info(drbd_dev *mdev, const char* where) +{ + struct drbd_bitmap *b = mdev->bitmap; + size_t w = (b->bm_bits-1) >> LN2_BPL; + + INFO("%s: bm_set=%lu\n", where, b->bm_set); + INFO("bm[%d]=0x%lX\n", w, b->bm[w]); + w++; + + if ( w < b->bm_words ) { + D_ASSERT(w == b->bm_words -1); + INFO("bm[%d]=0x%lX\n",w,b->bm[w]); + } +} +#else +#define bm_end_info(ignored...) ((void)(0)) +#endif + +/* long word offset of _bitmap_ sector */ +#define S2W(s) ((s)<<(BM_EXT_SIZE_B-BM_BLOCK_SIZE_B-LN2_BPL)) + +/* + * actually most functions herein should take a struct drbd_bitmap*, not a + * drbd_dev*, but for the debug macros I like to have the mdev around + * to be able to report device specific. + */ + +/* FIXME TODO sometimes I use "int offset" as index into the bitmap. + * since we currently are LIMITED to (128<<11)-64-8 sectors of bitmap, + * this is ok [as long as we dont run on a 24 bit arch :)]. + * But it is NOT strictly ok. + */ + +/* + * called on driver init only. TODO call when a device is created. + * allocates the drbd_bitmap, and stores it in mdev->bitmap. 
+ */ +int drbd_bm_init(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + D_BUG_ON(b); + b = kmalloc(sizeof(struct drbd_bitmap),GFP_KERNEL); + if (!b) + return -ENOMEM; + memset(b,0,sizeof(*b)); + b->bm_lock = SPIN_LOCK_UNLOCKED; + init_MUTEX(&b->bm_change); + mdev->bitmap = b; + return 0; +} + +sector_t drbd_bm_capacity(drbd_dev *mdev) +{ + ERR_IF(!mdev->bitmap) return 0; + return mdev->bitmap->bm_dev_capacity; +} + +/* called on driver unload. TODO: call when a device is destroyed. + */ +void drbd_bm_cleanup(drbd_dev *mdev) +{ + ERR_IF (!mdev->bitmap) return; + /* FIXME I think we should explicitly change the device size to zero + * before this... + * + D_BUG_ON(mdev->bitmap->bm); + */ + vfree(mdev->bitmap->bm); + kfree(mdev->bitmap); + mdev->bitmap = NULL; +} + +/* + * since (b->bm_bits % BITS_PER_LONG) != 0, + * this masks out the remaining bits. + * Rerturns the number of bits cleared. + */ +STATIC int bm_clear_surplus(struct drbd_bitmap * b) +{ + const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) -1; + size_t w = b->bm_bits >> LN2_BPL; + int cleared=0; + + if ( w < b->bm_words ) { + cleared = hweight_long(b->bm[w] & ~mask); + b->bm[w++] &= mask; + } + + if ( w < b->bm_words ) { + cleared += hweight_long(b->bm[w]); + b->bm[w++]=0; + } + + return cleared; +} + +STATIC void bm_set_surplus(struct drbd_bitmap * b) +{ + const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) -1; + size_t w = b->bm_bits >> LN2_BPL; + + if ( w < b->bm_words ) { + b->bm[w++] |= ~mask; + } + + if ( w < b->bm_words ) { + b->bm[w++] = ~(0UL); + } +} + +STATIC unsigned long bm_count_bits(struct drbd_bitmap * b) +{ + unsigned long *bm = b->bm; + unsigned long *ep = b->bm + b->bm_words; + unsigned long bits = 0; + + while ( bm < ep ) { + bits += hweight_long(*bm++); + } + + return bits; +} + +#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512) + +/* + * make sure the bitmap has enough room for the attached storage, + * if neccessary, resize. + * called whenever we may have changed the device size. + * returns -ENOMEM if we could not allocate enough memory, 0 on success. + * In case this is actually a resize, we copy the old bitmap into the new one. + * Otherwise, the bitmap is initiallized to all bits set. + */ +int drbd_bm_resize(drbd_dev *mdev, sector_t capacity) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long bits, bytes, words, *nbm, *obm = 0; + int err = 0, growing; + + ERR_IF(!b) return -ENOMEM; + MUST_BE_LOCKED(); + + ERR_IF (down_trylock(&b->bm_change)) { + down(&b->bm_change); + } + + if (capacity == b->bm_dev_capacity) + goto out; + + if (capacity == 0) { + spin_lock_irq(&b->bm_lock); + obm = b->bm; + b->bm = NULL; + b->bm_fo = + b->bm_set = + b->bm_bits = + b->bm_words = + b->bm_dev_capacity = 0; + spin_unlock_irq(&b->bm_lock); + goto free_obm; + } else { + bits = ALIGN(capacity,BM_SECTORS_PER_BIT) + >> (BM_BLOCK_SIZE_B-9); + + /* if we would use + words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL; + a 32bit host could present the wrong number of words + to a 64bit host. + */ + words = ALIGN(bits,64) >> LN2_BPL; + + D_ASSERT(bits < ((MD_RESERVED_SIZE<<1)-MD_BM_OFFSET)<<12 ); + + if ( words == b->bm_words ) { + /* optimize: capacity has changed, + * but only within one long word worth of bits. + * just update the bm_dev_capacity and bm_bits members. 
+ */ + spin_lock_irq(&b->bm_lock); + b->bm_bits = bits; + b->bm_dev_capacity = capacity; + b->bm_set -= bm_clear_surplus(b); + bm_end_info(mdev, __FUNCTION__ ); + spin_unlock_irq(&b->bm_lock); + goto out; + } else { + /* one extra long to catch off by one errors */ + bytes = (words+1)*sizeof(long); + nbm = vmalloc(bytes); + if (!nbm) { + err = -ENOMEM; + goto out; + } + } + spin_lock_irq(&b->bm_lock); + obm = b->bm; + // brgs. move several MB within spinlock... + if (obm) { + bm_set_surplus(b); + D_ASSERT(b->bm[b->bm_words] == DRBD_MAGIC); + memcpy(nbm,obm,min_t(size_t,b->bm_words,words)*sizeof(long)); + } + growing = words > b->bm_words; + if (growing) { // set all newly allocated bits + memset( nbm+b->bm_words, -1, + (words - b->bm_words) * sizeof(long) ); + b->bm_set += bits - b->bm_bits; + } + nbm[words] = DRBD_MAGIC; + b->bm = nbm; + b->bm_bits = bits; + b->bm_words = words; + b->bm_dev_capacity = capacity; + bm_clear_surplus(b); + if( !growing ) b->bm_set = bm_count_bits(b); + bm_end_info(mdev, __FUNCTION__ ); + spin_unlock_irq(&b->bm_lock); + INFO("resync bitmap: bits=%lu words=%lu\n",bits,words); + } + free_obm: + vfree(obm); // vfree(NULL) is noop + out: + up(&b->bm_change); + return err; +} + +/* inherently racy: + * if not protected by other means, return value may be out of date when + * leaving this function... + * we still need to lock it, since it is important that this returns + * bm_set == 0 precisely. + * + * maybe bm_set should be atomic_t ? + */ +unsigned long drbd_bm_total_weight(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long s; + unsigned long flags; + + ERR_IF(!b) return 0; + // MUST_BE_LOCKED(); well. yes. but ... + + spin_lock_irqsave(&b->bm_lock,flags); + s = b->bm_set; + spin_unlock_irqrestore(&b->bm_lock,flags); + + return s; +} + +size_t drbd_bm_words(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return 0; + + /* FIXME + * actually yes. really. otherwise it could just change its size ... + * but it triggers all the time... + * MUST_BE_LOCKED(); + */ + + return b->bm_words; +} + +/* merge number words from buffer into the bitmap starting at offset. + * buffer[i] is expected to be little endian unsigned long. + */ +void drbd_bm_merge_lel( drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer ) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *bm; + unsigned long word, bits; + size_t n = number; + + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + D_BUG_ON(offset >= b->bm_words); + D_BUG_ON(offset+number > b->bm_words); + D_BUG_ON(number > PAGE_SIZE/sizeof(long)); + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + // BM_PARANOIA_CHECK(); no. + bm = b->bm + offset; + while(n--) { + bits = hweight_long(*bm); + word = *bm | lel_to_cpu(*buffer++); + *bm++ = word; + b->bm_set += hweight_long(word) - bits; + } + /* with 32bit <-> 64bit cross-platform connect + * this is only correct for current usage, + * where we _know_ that we are 64 bit aligned, + * and know that this function is used in this way, too... + */ + if (offset+number == b->bm_words) { + b->bm_set -= bm_clear_surplus(b); + bm_end_info(mdev, __FUNCTION__ ); + } + spin_unlock_irq(&b->bm_lock); +} + +/* copy number words from buffer into the bitmap starting at offset. + * buffer[i] is expected to be little endian unsigned long. 
+ */ +void drbd_bm_set_lel( drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer ) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *bm; + unsigned long word, bits; + size_t n = number; + + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + D_BUG_ON(offset >= b->bm_words); + D_BUG_ON(offset+number > b->bm_words); + D_BUG_ON(number > PAGE_SIZE/sizeof(long)); + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + // BM_PARANOIA_CHECK(); no. + bm = b->bm + offset; + while(n--) { + bits = hweight_long(*bm); + word = lel_to_cpu(*buffer++); + *bm++ = word; + b->bm_set += hweight_long(word) - bits; + } + /* with 32bit <-> 64bit cross-platform connect + * this is only correct for current usage, + * where we _know_ that we are 64 bit aligned, + * and know that this function is used in this way, too... + */ + if (offset+number == b->bm_words) { + b->bm_set -= bm_clear_surplus(b); + bm_end_info(mdev, __FUNCTION__ ); + } + spin_unlock_irq(&b->bm_lock); +} + +/* copy number words from the bitmap starting at offset into the buffer. + * buffer[i] will be little endian unsigned long. + */ +void drbd_bm_get_lel( drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer ) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *bm; + + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + if ( (offset >= b->bm_words) || + (offset+number > b->bm_words) || + (number > PAGE_SIZE/sizeof(long)) || + (number <= 0) ) { + // yes, there is "%z", but that gives compiler warnings... + ERR("offset=%lu number=%lu bm_words=%lu\n", + (unsigned long) offset, + (unsigned long) number, + (unsigned long) b->bm_words); + return; + } + + // MUST_BE_LOCKED(); yes. but not neccessarily globally... + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + bm = b->bm + offset; + while(number--) *buffer++ = cpu_to_lel(*bm++); + spin_unlock_irq(&b->bm_lock); +} + +/* set all bits in the bitmap */ +void drbd_bm_set_all(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + memset(b->bm,-1,b->bm_words*sizeof(long)); + bm_clear_surplus(b); + b->bm_set = b->bm_bits; + spin_unlock_irq(&b->bm_lock); +} + +/* read one sector of the on disk bitmap into memory. + * on disk bitmap is little endian. + * @enr is _sector_ offset from start of on disk bitmap (aka bm-extent nr). + * returns 0 on success, -EIO on failure + */ +int drbd_bm_read_sect(drbd_dev *mdev,unsigned long enr) +{ + sector_t on_disk_sector = enr + drbd_md_ss(mdev) + MD_BM_OFFSET; + int bm_words, num_words, offset, err = 0; + + // MUST_BE_LOCKED(); not neccessarily global ... + + down(&mdev->md_io_mutex); + if(drbd_md_sync_page_io(mdev,on_disk_sector,READ)) { + bm_words = drbd_bm_words(mdev); + offset = S2W(enr); // word offset into bitmap + num_words = min(S2W(1), bm_words - offset); +#if DUMP_MD >= 3 + INFO("read_sect: sector=%lu offset=%u num_words=%u\n", + enr, offset, num_words); +#endif + drbd_bm_set_lel( mdev, offset, num_words, + page_address(mdev->md_io_page) ); + } else { + int i; + err = -EIO; + ERR( "IO ERROR reading bitmap sector %lu " + "(meta-disk sector %lu)\n", + enr, (unsigned long)on_disk_sector ); + drbd_chk_io_error(mdev, 1); + drbd_io_error(mdev); + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) + drbd_bm_ALe_set_all(mdev,enr*AL_EXT_PER_BM_SECT+i); + } + up(&mdev->md_io_mutex); + return err; +} + +/** + * drbd_bm_read: Read the whole bitmap from its on disk location. 
+ */ +void drbd_bm_read(struct Drbd_Conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + sector_t sector; + int bm_words, num_sectors; + char ppb[10]; + + MUST_BE_LOCKED(); + + bm_words = drbd_bm_words(mdev); + num_sectors = (bm_words*sizeof(long) + 511) >> 9; + + for (sector = 0; sector < num_sectors; sector++) { + // FIXME do something on io error here? + drbd_bm_read_sect(mdev,sector); + } + + INFO("%s marked out-of-sync by on disk bit-map.\n", + ppsize(ppb,drbd_bm_total_weight(mdev) << (BM_BLOCK_SIZE_B-10)) ); +} + +/** + * drbd_bm_write_sect: Writes a 512 byte piece of the bitmap to its + * on disk location. On disk bitmap is little endian. + * + * @enr: The _sector_ offset from the start of the bitmap. + * + */ +int drbd_bm_write_sect(struct Drbd_Conf *mdev,unsigned long enr) +{ + sector_t on_disk_sector = enr + drbd_md_ss(mdev) + MD_BM_OFFSET; + int bm_words, num_words, offset, err = 0; + + // MUST_BE_LOCKED(); not neccessarily global... + + down(&mdev->md_io_mutex); + bm_words = drbd_bm_words(mdev); + offset = S2W(enr); // word offset into bitmap + num_words = min(S2W(1), bm_words - offset); +#if DUMP_MD >= 3 + INFO("write_sect: sector=%lu offset=%u num_words=%u\n", + enr, offset, num_words); +#endif + if (num_words < S2W(1)) { + memset(page_address(mdev->md_io_page),0,MD_HARDSECT); + } + drbd_bm_get_lel( mdev, offset, num_words, + page_address(mdev->md_io_page) ); + if (!drbd_md_sync_page_io(mdev,on_disk_sector,WRITE)) { + int i; + err = -EIO; + ERR( "IO ERROR writing bitmap sector %lu " + "(meta-disk sector %lu)\n", + enr, (unsigned long)on_disk_sector ); + drbd_chk_io_error(mdev, 1); + drbd_io_error(mdev); + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) + drbd_bm_ALe_set_all(mdev,enr*AL_EXT_PER_BM_SECT+i); + } + mdev->bm_writ_cnt++; + up(&mdev->md_io_mutex); + return err; +} + +/** + * drbd_bm_write: Write the whole bitmap to its on disk location. + */ +void drbd_bm_write(struct Drbd_Conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + sector_t sector; + int bm_words, num_sectors; + + MUST_BE_LOCKED(); + + bm_words = drbd_bm_words(mdev); + num_sectors = (bm_words*sizeof(long) + 511) >> 9; + + for (sector = 0; sector < num_sectors; sector++) { + // FIXME do something on io error here? + drbd_bm_write_sect(mdev,sector); + } + + INFO("%lu KB now marked out-of-sync by on disk bit-map.\n", + drbd_bm_total_weight(mdev) << (BM_BLOCK_SIZE_B-10) ); +} + +/* clear all bits in the bitmap */ +void drbd_bm_clear_all(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + + MUST_BE_LOCKED(); \ + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + memset(b->bm,0,b->bm_words*sizeof(long)); + b->bm_set = 0; + spin_unlock_irq(&b->bm_lock); +} + +void drbd_bm_reset_find(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return; + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + b->bm_fo = 0; + spin_unlock_irq(&b->bm_lock); + +} + +/* NOTE + * find_first_bit returns int, we return unsigned long. + * should not make much difference anyways, but ... + * this returns a bit number, NOT a sector! 
+ */ +unsigned long drbd_bm_find_next(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long i = -1UL; + ERR_IF(!b) return i; + ERR_IF(!b->bm) return i; + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + if (b->bm_fo < b->bm_bits) { + i = find_next_bit(b->bm,b->bm_bits,b->bm_fo); + } else if (b->bm_fo > b->bm_bits) { + ERR("bm_fo=%lu bm_bits=%lu\n",b->bm_fo, b->bm_bits); + } + if (i >= b->bm_bits) { + i = -1UL; + b->bm_fo = 0; + } else { + b->bm_fo = i+1; + } + spin_unlock_irq(&b->bm_lock); + return i; +} + +int drbd_bm_rs_done(drbd_dev *mdev) +{ + return mdev->bitmap->bm_fo == 0; +} + +// THINK maybe the D_BUG_ON(i<0)s in set/clear/test should be not that strict? + +/* returns previous bit state + * wants bitnr, NOT sector. + */ +int drbd_bm_set_bit(drbd_dev *mdev, const unsigned long bitnr) +{ + struct drbd_bitmap *b = mdev->bitmap; + int i; + ERR_IF(!b) return 1; + ERR_IF(!b->bm) return 1; + +/* + * only called from drbd_set_out_of_sync. + * strange_state blubber is already in place there... + strange_state = ( mdev->cstate > Connected ) || + ( mdev->cstate == Connected && + !(test_bit(DISKLESS,&mdev->flags) || + test_bit(PARTNER_DISKLESS,&mdev->flags)) ); + if (strange_state) + ERR("%s in drbd_bm_set_bit\n", cstate_to_name(mdev->cstate)); +*/ + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + MUST_NOT_BE_LOCKED(); + ERR_IF (bitnr >= b->bm_bits) { + ERR("bitnr=%lu bm_bits=%lu\n",bitnr, b->bm_bits); + i = 0; + } else { + i = (0 != __test_and_set_bit(bitnr, b->bm)); + b->bm_set += !i; + } + spin_unlock_irq(&b->bm_lock); + return i; +} + +/* returns previous bit state + * wants bitnr, NOT sector. + */ +int drbd_bm_clear_bit(drbd_dev *mdev, const unsigned long bitnr) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long flags; + int i; + ERR_IF(!b) return 0; + ERR_IF(!b->bm) return 0; + + spin_lock_irqsave(&b->bm_lock,flags); + BM_PARANOIA_CHECK(); + MUST_NOT_BE_LOCKED(); + ERR_IF (bitnr >= b->bm_bits) { + ERR("bitnr=%lu bm_bits=%lu\n",bitnr, b->bm_bits); + i = 0; + } else { + i = (0 != __test_and_clear_bit(bitnr, b->bm)); + b->bm_set -= i; + } + spin_unlock_irqrestore(&b->bm_lock,flags); + + /* clearing bits should only take place when sync is in progress! + * this is only called from drbd_set_in_sync. + * strange_state blubber is already in place there ... + if (i && mdev->cstate <= Connected) + ERR("drbd_bm_clear_bit: cleared a bitnr=%lu while %s\n", + bitnr, cstate_to_name(mdev->cstate)); + */ + + return i; +} + +/* returns bit state + * wants bitnr, NOT sector. + * inherently racy... area needs to be locked by means of {al,rs}_lru + */ +int drbd_bm_test_bit(drbd_dev *mdev, const unsigned long bitnr) +{ + struct drbd_bitmap *b = mdev->bitmap; + int i; + ERR_IF(!b) return 0; + ERR_IF(!b->bm) return 0; + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + ERR_IF (bitnr >= b->bm_bits) { + ERR("bitnr=%lu bm_bits=%lu\n",bitnr, b->bm_bits); + i = 0; + } else { + i = test_bit(bitnr, b->bm); + } + spin_unlock_irq(&b->bm_lock); + return i; +} + +/* inherently racy... + * return value may be already out-of-date when this function returns. + * but the general usage is that this is only use during a cstate when bits are + * only cleared, not set, and typically only care for the case when the return + * value is zero, or we already "locked" this "bitmap extent" by other means. + * + * enr is bm-extent number, since we chose to name one sector (512 bytes) + * worth of the bitmap a "bitmap extent". 
+ * + * TODO + * I think since we use it like a reference count, we should use the real + * reference count of some bitmap extent element from some lru instead... + * + */ +int drbd_bm_e_weight(drbd_dev *mdev, unsigned long enr) +{ + struct drbd_bitmap *b = mdev->bitmap; + int count, s, e; + unsigned long flags; + + ERR_IF(!b) return 0; + ERR_IF(!b->bm) return 0; + spin_lock_irqsave(&b->bm_lock,flags); + BM_PARANOIA_CHECK(); + + s = S2W(enr); + e = min((size_t)S2W(enr+1),b->bm_words); + count = 0; + if (s < b->bm_words) { + const unsigned long* w = b->bm+s; + int n = e-s; + while (n--) count += hweight_long(*w++); + } else { + ERR("start offset (%d) too large in drbd_bm_e_weight\n", s); + } + spin_unlock_irqrestore(&b->bm_lock,flags); +#if DUMP_MD >= 3 + INFO("enr=%lu weight=%d e=%d s=%d\n", enr, count, e, s); +#endif + return count; +} + +/* set all bits covered by the AL-extent al_enr */ +unsigned long drbd_bm_ALe_set_all(drbd_dev *mdev, unsigned long al_enr) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long weight; + int count, s, e; + ERR_IF(!b) return 0; + ERR_IF(!b->bm) return 0; + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + weight = b->bm_set; + + s = al_enr * BM_WORDS_PER_AL_EXT; + e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words); + count = 0; + if (s < b->bm_words) { + const unsigned long* w = b->bm+s; + int n = e-s; + while (n--) count += hweight_long(*w++); + n = e-s; + memset(b->bm+s,-1,n*sizeof(long)); + b->bm_set += n*BITS_PER_LONG - count; + if (e == b->bm_words) { + b->bm_set -= bm_clear_surplus(b); + } + } else { + ERR("start offset (%d) too large in drbd_bm_ALe_set_all\n", s); + } + weight = b->bm_set - weight; + spin_unlock_irq(&b->bm_lock); + return weight; +} --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./drivers/block/drbd/drbd_buildtag.c 2006-02-13 17:56:44.000000000 +0300 @@ -0,0 +1,6 @@ +/* automatically generated. DO NOT EDIT. */ +const char * drbd_buildtag(void) +{ + return "SVN Revision: 2066" + " build by phil@mescal, 2006-02-13 15:57:14"; +} --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./drivers/block/drbd/drbd_compat_types.h 2005-08-24 18:45:04.000000000 +0400 @@ -0,0 +1,324 @@ + +// currently only abstraction layer to get all references to buffer_head +// and b_some_thing out of our .c files. + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +#include + +typedef struct buffer_head drbd_bio_t; +typedef unsigned long sector_t; + +#define NOT_IN_26(x...) x +#define ONLY_IN_26(x...) + +#if !defined(CONFIG_HIGHMEM) && !defined(bh_kmap) +#define bh_kmap(bh) ((bh)->b_data) +#define bh_kunmap(bh) do { } while (0) +#endif + +#ifndef list_for_each +#define list_for_each(pos, head) \ + for(pos = (head)->next; pos != (head); pos = pos->next) +#endif + +// RH 2.4.9 does not have min() / max() +#ifndef min +# define min(x,y) \ + ({ typeof(x) __x = (x); typeof(y) __y = (y); \ + (void)(&__x == &__y); \ + __x < __y ? __x: __y; }) +#endif + +#ifndef max +# define max(x,y) \ + ({ typeof(x) __x = (x); typeof(y) __y = (y); \ + (void)(&__x == &__y); \ + __x > __y ? __x: __y; }) +#endif + +#ifndef MODULE_LICENSE +# define MODULE_LICENSE(L) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,10) +#define min_t(type,x,y) \ + ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; }) +#define max_t(type,x,y) \ + ({ type __x = (x); type __y = (y); __x > __y ? 
__x: __y; }) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,7) +#define completion semaphore +#define init_completion(A) init_MUTEX_LOCKED(A) +#define wait_for_completion(A) down(A) +#define complete(A) up(A) +#else +#include +#endif + +/* note that if you use some verndor kernels like SuSE, + * their 2.4.X variant probably already contain equivalent definitions. + * you then have to disable this compat again... + */ + +#ifndef HAVE_FIND_NEXT_BIT /* { */ + +#if defined(__i386__) || defined(__arch_um__) +/** + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit-number of the first set bit, not the number of the byte + * containing a bit. + */ +static __inline__ int find_first_bit(const unsigned long *addr, unsigned size) +{ + int d0, d1; + int res; + + /* This looks at memory. Mark it volatile to tell gcc not to move it around */ + __asm__ __volatile__( + "xorl %%eax,%%eax\n\t" + "repe; scasl\n\t" + "jz 1f\n\t" + "leal -4(%%edi),%%edi\n\t" + "bsfl (%%edi),%%eax\n" + "1:\tsubl %%ebx,%%edi\n\t" + "shll $3,%%edi\n\t" + "addl %%edi,%%eax" + :"=a" (res), "=&c" (d0), "=&D" (d1) + :"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory"); + return res; +} + +/** + * find_next_bit - find the first set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ + +static __inline__ int find_next_bit(const unsigned long *addr, int size, int offset) +{ + const unsigned long *p = addr + (offset >> 5); + int set = 0, bit = offset & 31, res; + + if (bit) { + /* + * Look for nonzero in the first 32 bits: + */ + __asm__("bsfl %1,%0\n\t" + "jne 1f\n\t" + "movl $32, %0\n" + "1:" + : "=r" (set) + : "r" (*p >> bit)); + if (set < (32 - bit)) + return set + offset; + set = 32 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = find_first_bit (p, size - 32 * (p - addr)); + return (offset + set + res); +} + +#elif defined(__x86_64__) + +static __inline__ int find_first_bit(const unsigned long * addr, unsigned size) +{ + int d0, d1; + int res; + + /* This looks at memory. 
Mark it volatile to tell gcc not to move it around */ + __asm__ __volatile__( + "xorl %%eax,%%eax\n\t" + "repe; scasl\n\t" + "jz 1f\n\t" + "leaq -4(%%rdi),%%rdi\n\t" + "bsfl (%%rdi),%%eax\n" + "1:\tsubq %%rbx,%%rdi\n\t" + "shll $3,%%edi\n\t" + "addl %%edi,%%eax" + :"=a" (res), "=&c" (d0), "=&D" (d1) + :"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory"); + return res; +} + +static __inline__ int find_next_bit(const unsigned long * addr, int size, int offset) +{ + const unsigned long * p = addr + (offset >> 6); + unsigned long set = 0, bit = offset & 63, res; + + if (bit) { + /* + * Look for nonzero in the first 64 bits: + */ + __asm__("bsfq %1,%0\n\t" + "cmoveq %2,%0\n\t" + : "=r" (set) + : "r" (*p >> bit), "r" (64L)); + if (set < (64 - bit)) + return set + offset; + set = 64 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = find_first_bit (p, size - 64 * (p - addr)); + return (offset + set + res); +} + +#elif defined(__alpha__) + +#include +#if __GNUC__ == 3 && __GNUC_MINOR__ >= 4 || __GNUC__ > 3 +# define __kernel_cmpbge(a, b) __builtin_alpha_cmpbge(a, b) +#else +# define __kernel_cmpbge(a, b) \ + ({ unsigned long __kir; \ + __asm__("cmpbge %r2,%1,%0" : "=r"(__kir) : "rI"(b), "rJ"(a)); \ + __kir; }) +#endif + +static inline unsigned long __ffs(unsigned long word) +{ +#if defined(__alpha_cix__) && defined(__alpha_fix__) + /* Whee. EV67 can calculate it directly. */ + return __kernel_cttz(word); +#else + unsigned long bits, qofs, bofs; + + bits = __kernel_cmpbge(0, word); + qofs = ffz_b(bits); + bits = __kernel_extbl(word, qofs); + bofs = ffz_b(~bits); + + return qofs*8 + bofs; +#endif +} + +static inline unsigned long +find_next_bit(void * addr, unsigned long size, unsigned long offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> 6); + unsigned long result = offset & ~63UL; + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset &= 63UL; + if (offset) { + tmp = *(p++); + tmp &= ~0UL << offset; + if (size < 64) + goto found_first; + if (tmp) + goto found_middle; + size -= 64; + result += 64; + } + while (size & ~63UL) { + if ((tmp = *(p++))) + goto found_middle; + result += 64; + size -= 64; + } + if (!size) + return result; + tmp = *p; + found_first: + tmp &= ~0UL >> (64 - size); + if (!tmp) + return result + size; + found_middle: + return result + __ffs(tmp); +} +#elif defined(USE_GENERIC_FIND_NEXT_BIT) + +#if BITS_PER_LONG == 32 +#define _xFFFF 31ul +#define _x10000 32 +#define _xSHIFT 5 +#elif BITS_PER_LONG == 64 +#define _xFFFF 63ul +#define _x10000 64 +#define _xSHIFT 6 +#else +#error "Unexpected BITS_PER_LONG" +#endif + +/* slightly large to be inlined, but anyways... 
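+ * with BITS_PER_LONG == 64 the constants above work out to
+ * _xFFFF == 63 (bit index mask), _x10000 == 64 (bits per word) and
+ * _xSHIFT == 6 (word index shift), e.g. offset 70 means word 70>>6 == 1,
+ * bit 70&63 == 6; the loop below then scans one long at a time.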
*/ +static inline unsigned long +find_next_bit(void * addr, unsigned long size, unsigned long offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> _xSHIFT); + unsigned long result = offset & ~_xFFFF; + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset &= _xFFFF; + if (offset) { + tmp = *(p++); + tmp &= ~0UL << offset; + if (size < _x10000) + goto found_first; + if (tmp) + goto found_middle; + size -= _x10000; + result += _x10000; + } + while (size & ~_xFFFF) { + if ((tmp = *(p++))) + goto found_middle; + result += _x10000; + size -= _x10000; + } + if (!size) + return result; + tmp = *p; + found_first: + tmp &= ~0UL >> (_x10000 - size); + if (!tmp) + return result + size; + found_middle: /* if this is reached, we know that (tmp != 0) */ + return result + generic_ffs(tmp)-1; +} + +#undef _xFFFF +#undef _x10000 +#undef _xSHIFT + +#elif !defined(__powerpc64__) /* ppc64 is taken care of, see drbd_bitmap.c */ +#warning "You probably need to copy find_next_bit() from a 2.6.x kernel." +#warning "Or enable low performance generic C-code" +#warning "(USE_GENERIC_FIND_NEXT_BIT in drbd_config.h)" +#endif + +#endif /* HAVE_FIND_NEXT_BIT } */ + +#ifndef ALIGN +#define ALIGN(x,a) ( ((x) + (a)-1) &~ ((a)-1) ) +#endif + +#ifndef BUG_ON +#define BUG_ON(condition) do { if (unlikely((condition)!=0)) BUG(); } while(0) +#endif + +#else // LINUX 2.6 + +typedef struct bio drbd_bio_t; + +#define SIGHAND_HACK + +#define NOT_IN_26(x...) +#define ONLY_IN_26(x...) x + +#endif --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./drivers/block/drbd/drbd_compat_wrappers.h 2005-08-16 16:32:40.000000000 +0400 @@ -0,0 +1,653 @@ +// currently only abstraction layer to get all references to buffer_head +// and b_some_thing out of our .c files. + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + +#define __module_get __MOD_INC_USE_COUNT +#define module_put __MOD_DEC_USE_COUNT + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20) +/* + * dump_stack() showed up in 2.4.20. + * show_stack is arch-specific + * The architecture-independent backtrace generator + */ +static inline void dump_stack(void) +{ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,18) + // It seems that before 2.4.18 even show_stack is not available. + show_stack(0); +#endif +} +#endif + +// b_end_io handlers +extern void drbd_md_io_complete (struct buffer_head *bh, int uptodate); +extern void enslaved_read_bi_end_io (struct buffer_head *bh, int uptodate); +extern void drbd_dio_end_sec (struct buffer_head *bh, int uptodate); +extern void drbd_dio_end (struct buffer_head *bh, int uptodate); +extern void drbd_read_bi_end_io (struct buffer_head *bh, int uptodate); + +/* + * because in 2.6.x [sg]et_capacity operate on gendisk->capacity, which is in + * units of 512 bytes sectors, these wrappers have a <<1 or >>1 where + * appropriate. + */ + +static inline sector_t drbd_get_hardsect(kdev_t dev) +{ + return hardsect_size[MAJOR(dev)] ? + hardsect_size[MAJOR(dev)][MINOR(dev)] : 512; +} + +/* Returns the number of 512 byte sectors of the device */ +static inline sector_t drbd_get_capacity(kdev_t dev) +{ + return dev ? blk_size[MAJOR(dev)][MINOR(dev)]<<1 : 0; +} + +/* sets the number of 512 byte sectors of our virtual device */ +static inline void drbd_set_my_capacity(drbd_dev *mdev, sector_t size) +{ + blk_size[MAJOR_NR][(int)(mdev - drbd_conf)] = (size>>1); +} + +//#warning "FIXME why don't we care for the return value?" 
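+// sets the soft blocksize on the drbd device itself and, if we have one,
+// on the backing device as well; the D_ASSERT below only fires if this
+// gets called while we are diskless, which is not supposed to happen.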
+static inline void drbd_set_blocksize(drbd_dev *mdev, int blksize) +{ + set_blocksize(mdev->this_bdev, blksize); + if (mdev->backing_bdev) + set_blocksize(mdev->backing_bdev, blksize); + else D_ASSERT(mdev->backing_bdev); +} + +static inline int drbd_sync_me(drbd_dev *mdev) +{ + return fsync_dev(mdev->this_bdev); +} + +#define drbd_bio_uptodate(bio) buffer_uptodate(bio) + +static inline void drbd_bio_IO_error(struct buffer_head *bh) +{ + buffer_IO_error(bh); +} + +static inline void drbd_bio_endio(struct buffer_head *bh, int uptodate) +{ + bh->b_end_io(bh,uptodate); +} + +static inline drbd_dev* drbd_req_get_mdev(struct drbd_request *req) +{ + return (drbd_dev*) req->private_bio.b_private; +} + +static inline sector_t drbd_req_get_sector(struct drbd_request *req) +{ + return req->private_bio.b_blocknr; +} + +static inline unsigned short drbd_req_get_size(struct drbd_request *req) +{ + return req->private_bio.b_size; +} + +static inline drbd_bio_t* drbd_req_private_bio(struct drbd_request *req) +{ + return &req->private_bio; +} + +static inline sector_t drbd_ee_get_sector(struct Tl_epoch_entry *ee) +{ + return ee->private_bio.b_blocknr; +} + +static inline unsigned short drbd_ee_get_size(struct Tl_epoch_entry *ee) +{ + return ee->private_bio.b_size; +} + +static inline char *drbd_bio_kmap(struct buffer_head *bh) +{ + return bh_kmap(bh); +} + +static inline void drbd_bio_kunmap(struct buffer_head *bh) +{ + bh_kunmap(bh); +} + +static inline void drbd_ee_init(struct Tl_epoch_entry *e,struct page *page) +{ + struct buffer_head * const bh = &e->private_bio; + memset(e, 0, sizeof(*e)); + + // bh->b_list = BUF_LOCKED; // does it matter? + bh->b_size = PAGE_SIZE; + bh->b_this_page = bh; + bh->b_state = (1 << BH_Mapped); + init_waitqueue_head(&bh->b_wait); + set_bh_page(bh,page,0); + atomic_set(&bh->b_count, 1); + + e->block_id = ID_VACANT; +} + +static inline void drbd_bio_set_pages_dirty(struct buffer_head *bh) +{ + set_bit(BH_Dirty, &bh->b_state); +} + +static inline void drbd_bio_set_end_io(struct buffer_head *bh, bh_end_io_t * h) +{ + bh->b_end_io = h; +} + +static inline void +drbd_ee_bh_prepare(drbd_dev *mdev, struct buffer_head *bh, + sector_t sector, int size) +{ + D_ASSERT(mdev->backing_bdev); + + bh->b_blocknr = sector; // We abuse b_blocknr here. + bh->b_size = size; + bh->b_rsector = sector; + bh->b_rdev = mdev->backing_bdev; + bh->b_private = mdev; + bh->b_state = (1 << BH_Req) + |(1 << BH_Mapped) + |(1 << BH_Lock); +} + +static inline void +drbd_ee_prepare_write(drbd_dev *mdev, struct Tl_epoch_entry* e, + sector_t sector, int size) +{ + struct buffer_head * const bh = &e->private_bio; + + drbd_ee_bh_prepare(mdev,bh,sector,size); + set_bit(BH_Uptodate,&bh->b_state); + set_bit(BH_Dirty,&bh->b_state); + bh->b_end_io = drbd_dio_end_sec; +} + +static inline void +drbd_ee_prepare_read(drbd_dev *mdev, struct Tl_epoch_entry* e, + sector_t sector, int size) +{ + struct buffer_head * const bh = &e->private_bio; + + drbd_ee_bh_prepare(mdev,bh,sector,size); + bh->b_end_io = enslaved_read_bi_end_io; +} + +static inline void +drbd_bh_clone(struct buffer_head *bh, struct buffer_head *bh_src) +{ + memset(bh,0,sizeof(*bh)); + bh->b_list = bh_src->b_list; // BUF_LOCKED; + bh->b_size = bh_src->b_size; + bh->b_state = bh_src->b_state & ((1 << BH_PrivateStart)-1); + bh->b_page = bh_src->b_page; + bh->b_data = bh_src->b_data; + bh->b_rsector = bh_src->b_rsector; + bh->b_blocknr = bh_src->b_rsector; // We abuse b_blocknr here. 
+ bh->b_dev = bh_src->b_dev; // hint for LVM as to + // which device to call fsync_dev + // on for snapshots + atomic_set(&bh->b_count, 1); + init_waitqueue_head(&bh->b_wait); + // other members stay NULL +} + +static inline void +drbd_req_prepare_write(drbd_dev *mdev, struct drbd_request *req) +{ + struct buffer_head * const bh = &req->private_bio; + struct buffer_head * const bh_src = req->master_bio; + + drbd_bh_clone(bh,bh_src); + bh->b_rdev = mdev->backing_bdev; + bh->b_private = mdev; + bh->b_end_io = drbd_dio_end; + + D_ASSERT(buffer_req(bh)); + D_ASSERT(buffer_locked(bh)); + D_ASSERT(buffer_mapped(bh)); + // D_ASSERT(buffer_dirty(bh)); // It is not true ?!? + /* kupdated keeps submitting "non-uptodate" buffers. + ERR_IF (!buffer_uptodate(bh)) { + ERR("[%s/%d]: bh_src->b_state=%lx bh->b_state=%lx\n", + current->comm, current->pid, + bh_src->b_state, bh->b_state); + }; + */ + + // FIXME should not be necessary; + // remove if the assertions above do not trigger. + bh->b_state = (1 << BH_Uptodate) + |(1 << BH_Dirty) + |(1 << BH_Lock) + |(1 << BH_Req) + |(1 << BH_Mapped) ; + + req->rq_status = RQ_DRBD_NOTHING; +} + +static inline void +drbd_req_prepare_read(drbd_dev *mdev, struct drbd_request *req) +{ + struct buffer_head * const bh = &req->private_bio; + struct buffer_head * const bh_src = req->master_bio; + + drbd_bh_clone(bh,bh_src); + bh->b_rdev = mdev->backing_bdev; + bh->b_private = mdev; + bh->b_end_io = drbd_read_bi_end_io; + + D_ASSERT(buffer_req(bh)); + D_ASSERT(buffer_locked(bh)); + D_ASSERT(buffer_mapped(bh)); + D_ASSERT(!buffer_uptodate(bh)); + + // FIXME should not be necessary; + // remove if the assertions above do not trigger. + bh->b_state = (1 << BH_Lock) + |(1 << BH_Req) + |(1 << BH_Mapped) ; + + req->rq_status = RQ_DRBD_NOTHING; +} + +static inline struct page* drbd_bio_get_page(struct buffer_head *bh) +{ + return bh->b_page; +} + +static inline void drbd_generic_make_request(int rw, struct buffer_head *bh) +{ + drbd_dev *mdev = drbd_conf -1 ; + + if (!bh->b_rdev) { + if (DRBD_ratelimit(5*HZ,5)) { + printk(KERN_ERR "drbd_generic_make_request: bh->b_rdev == NULL\n"); + dump_stack(); + } + drbd_bio_IO_error(bh); + return; + } + + generic_make_request(rw, bh); +} + +static inline void drbd_kick_lo(drbd_dev *mdev) +{ + run_task_queue(&tq_disk); +} + +static inline void drbd_plug_device(drbd_dev *mdev) +{ + D_ASSERT(mdev->state == Primary); + if (mdev->cstate < Connected) + return; + if (!test_and_set_bit(UNPLUG_QUEUED,&mdev->flags)) { + /* if it could not be queued, clear our flag again, too */ + if (!queue_task(&mdev->write_hint_tq, &tq_disk)) + clear_bit(UNPLUG_QUEUED,&mdev->flags); + } +} + +/* for increased performance, + * we try to use zero copy network send whenever possible. + * + * maybe TODO: + * find out whether we can use zero copy network recv, too, somehow. + * we'd need to define some sk_read_actor_t, and then use + * tcp_read_sock ... + */ +static inline int _drbd_send_zc_bio(drbd_dev *mdev, struct buffer_head *bh) +{ + struct page *page = bh->b_page; + size_t size = bh->b_size; + + return _drbd_send_page(mdev,page,bh_offset(bh),size); +} + +/* for proto A, we cannot use zero copy network send: + * we don't want to "ack" a send when we put a reference to it on the socket, + * but when it actually has reached the sendbuffer (so is likely to actually be + * on the wire in a couple of jiffies). 
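+ * so _drbd_send_bio below just kmaps the buffer and hands it to
+ * drbd_send(), which copies it into the socket; once that returned,
+ * the data no longer references our page.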
+ */ +static inline int _drbd_send_bio(drbd_dev *mdev, struct buffer_head *bh) +{ + size_t size = bh->b_size; + int ret; + + ret = drbd_send(mdev, mdev->data.socket, bh_kmap(bh), size, 0); + bh_kunmap(bh); + return ret; +} + +#else +// LINUX_VERSION_CODE > 2,5,0 + +#include // for fsync_bdev + +/* see get_sb_bdev and bd_claim */ +extern char* drbd_sec_holder; + +// bi_end_io handlers +// int (bio_end_io_t) (struct bio *, unsigned int, int); +extern int drbd_md_io_complete (struct bio *bio, unsigned int bytes_done, int error); +extern int enslaved_read_bi_end_io (struct bio *bio, unsigned int bytes_done, int error); +extern int drbd_dio_end_sec (struct bio *bio, unsigned int bytes_done, int error); +extern int drbd_dio_end (struct bio *bio, unsigned int bytes_done, int error); +extern int drbd_read_bi_end_io (struct bio *bio, unsigned int bytes_done, int error); + +static inline sector_t drbd_get_hardsect(struct block_device *bdev) +{ + return bdev->bd_disk->queue->hardsect_size; +} + +/* Returns the number of 512 byte sectors of the device */ +static inline sector_t drbd_get_capacity(struct block_device *bdev) +{ + return bdev ? bdev->bd_inode->i_size >> 9 : 0; +} + +/* sets the number of 512 byte sectors of our virtual device */ +static inline void drbd_set_my_capacity(drbd_dev *mdev, sector_t size) +{ + set_capacity(mdev->vdisk,size); + mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9; +} + +//#warning "FIXME why don't we care for the return value?" +static inline void drbd_set_blocksize(drbd_dev *mdev, int blksize) +{ + set_blocksize(mdev->this_bdev,blksize); + if (mdev->backing_bdev) { + set_blocksize(mdev->backing_bdev, blksize); + } else { + D_ASSERT(mdev->backing_bdev); + // FIXME send some package over to the peer? + } +} + +static inline int drbd_sync_me(drbd_dev *mdev) +{ + return fsync_bdev(mdev->this_bdev); +} + +#define drbd_bio_uptodate(bio) bio_flagged(bio,BIO_UPTODATE) + +static inline void drbd_bio_IO_error(struct bio *bio) +{ + bio_endio(bio,bio->bi_size,-EIO); +} + +static inline void drbd_bio_endio(struct bio *bio, int uptodate) +{ + bio_endio(bio,bio->bi_size,uptodate ? 0 : -EIO); +} + +static inline drbd_dev* drbd_req_get_mdev(struct drbd_request *req) +{ + return (drbd_dev*) req->mdev; +} + +static inline sector_t drbd_req_get_sector(struct drbd_request *req) +{ + return req->master_bio->bi_sector; +} + +static inline unsigned short drbd_req_get_size(struct drbd_request *req) +{ + drbd_dev* mdev = req->mdev; + D_ASSERT(req->master_bio->bi_size); + return req->master_bio->bi_size; +} + +static inline drbd_bio_t* drbd_req_private_bio(struct drbd_request *req) +{ + return req->private_bio; +} + +static inline sector_t drbd_ee_get_sector(struct Tl_epoch_entry *ee) +{ + return ee->ee_sector; +} + +static inline unsigned short drbd_ee_get_size(struct Tl_epoch_entry *ee) +{ + return ee->ee_size; +} + +#ifdef CONFIG_HIGHMEM +/* + * I don't know why there is no bvec_kmap, only bvec_kmap_irq ... + * + * we do a sock_recvmsg into the target buffer, + * so we obviously cannot use the bvec_kmap_irq variant. -lge + * + * Most likely it is only due to performance anyways: + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because + * no global lock is needed and because the kmap code must perform a global TLB + * invalidation when the kmap pool wraps. + * + * However when holding an atomic kmap is is not legal to sleep, so atomic + * kmaps are appropriate for short, tight code paths only. 
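+ * sock_recvmsg may block, so we have to use the sleeping kmap/kunmap
+ * pair in drbd_bio_kmap/drbd_bio_kunmap below (they only map the
+ * current bvec of the bio).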
+ */ +static inline char *drbd_bio_kmap(struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + unsigned long addr; + + addr = (unsigned long) kmap(bvec->bv_page); + + if (addr & ~PAGE_MASK) + BUG(); + + return (char *) addr + bvec->bv_offset; +} + +static inline void drbd_bio_kunmap(struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + + kunmap(bvec->bv_page); +} + +#else +static inline char *drbd_bio_kmap(struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + return page_address(bvec->bv_page) + bvec->bv_offset; +} +static inline void drbd_bio_kunmap(struct bio *bio) +{ + // do nothing. +} +#endif + +static inline void drbd_ee_init(struct Tl_epoch_entry *e,struct page *page) +{ + struct bio * const bio = &e->private_bio; + struct bio_vec * const vec = &e->ee_bvec; + + memset(e, 0, sizeof(*e)); + bio_init(bio); + + bio->bi_io_vec = vec; + bio->bi_destructor = NULL; + vec->bv_page = page; + bio->bi_size = vec->bv_len = PAGE_SIZE; + bio->bi_max_vecs = bio->bi_vcnt = 1; + vec->bv_offset = 0; + + e->block_id = ID_VACANT; +} + +static inline void drbd_bio_set_pages_dirty(struct bio *bio) +{ + bio_set_pages_dirty(bio); +} + +static inline void drbd_bio_set_end_io(struct bio *bio, bio_end_io_t * h) +{ + bio->bi_end_io = h; +} + +static inline void +drbd_ee_bio_prepare(drbd_dev *mdev, struct Tl_epoch_entry* e, + sector_t sector, int size) +{ + struct bio * const bio = &e->private_bio; + struct bio_vec * const vec = &e->ee_bvec; + struct page * const page = vec->bv_page; + D_ASSERT(mdev->backing_bdev); + + /* Clear plate. */ + bio_init(bio); + + bio->bi_io_vec = vec; + bio->bi_destructor = NULL; + vec->bv_page = page; + vec->bv_offset = 0; + bio->bi_max_vecs = bio->bi_vcnt = 1; + + bio->bi_bdev = mdev->backing_bdev; + bio->bi_private = mdev; + + e->ee_sector = bio->bi_sector = sector; + e->ee_size = bio->bi_size = bio->bi_io_vec->bv_len = size; +} + +static inline void +drbd_ee_prepare_write(drbd_dev *mdev, struct Tl_epoch_entry* e, + sector_t sector, int size) +{ + drbd_ee_bio_prepare(mdev,e,sector,size); + e->private_bio.bi_end_io = drbd_dio_end_sec; +} + +static inline void +drbd_ee_prepare_read(drbd_dev *mdev, struct Tl_epoch_entry* e, + sector_t sector, int size) +{ + drbd_ee_bio_prepare(mdev,e,sector,size); + e->private_bio.bi_end_io = enslaved_read_bi_end_io; +} + +static inline void +drbd_req_prepare_write(drbd_dev *mdev, struct drbd_request *req) +{ + struct bio *bio; + + bio = req->private_bio = bio_clone(req->master_bio, GFP_NOIO ); + bio->bi_bdev = mdev->backing_bdev; + bio->bi_private = req; + bio->bi_end_io = drbd_dio_end; + bio->bi_next = 0; + + req->rq_status = RQ_DRBD_NOTHING; + req->mdev = mdev; +} + +static inline void +drbd_req_prepare_read(drbd_dev *mdev, struct drbd_request *req) +{ + struct bio *bio; + + bio = req->private_bio = bio_clone(req->master_bio, GFP_NOIO ); + bio->bi_bdev = mdev->backing_bdev; + bio->bi_private = req; + bio->bi_end_io = drbd_read_bi_end_io; // <- only difference + bio->bi_next = 0; + + req->rq_status = RQ_DRBD_NOTHING; + req->mdev = mdev; +} + +static inline struct page* drbd_bio_get_page(struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + return bvec->bv_page; +} + +/* + * used to submit our private bio + */ +static inline void drbd_generic_make_request(int rw, struct bio *bio) +{ + drbd_dev *mdev = drbd_conf -1; // for DRBD_ratelimit + bio->bi_rw = rw; // on the receiver side, e->..rw was not yet defined. 
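+ // paranoia: a request without a target device would blow up further
+ // down in the block layer, so fail it here (rate limited) instead.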
+ + if (!bio->bi_bdev) { + if (DRBD_ratelimit(5*HZ,5)) { + printk(KERN_ERR "drbd_generic_make_request: bio->bi_bdev == NULL\n"); + dump_stack(); + } + drbd_bio_IO_error(bio); + return; + } + + generic_make_request(bio); +} + +static inline void drbd_blk_run_queue(request_queue_t *q) +{ + if (q && q->unplug_fn) + q->unplug_fn(q); +} + +static inline void drbd_kick_lo(drbd_dev *mdev) +{ + if (!mdev->backing_bdev) { + if (DRBD_ratelimit(5*HZ,5)) { + ERR("backing_bdev==NULL in drbd_kick_lo\n"); + dump_stack(); + } + } else { + drbd_blk_run_queue(bdev_get_queue(mdev->backing_bdev)); + } +} + +static inline void drbd_plug_device(drbd_dev *mdev) +{ + request_queue_t *q = bdev_get_queue(mdev->this_bdev); + + spin_lock_irq(q->queue_lock); + +/* XXX the check on !blk_queue_plugged is redundant, + * implicitly checked in blk_plug_device */ + + if(!blk_queue_plugged(q)) { + blk_plug_device(q); + del_timer(&q->unplug_timer); + // unplugging should not happen automatically... + } + spin_unlock_irq(q->queue_lock); +} + +static inline int _drbd_send_zc_bio(drbd_dev *mdev, struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec_idx(bio, bio->bi_idx); + return _drbd_send_page(mdev,bvec->bv_page,bvec->bv_offset,bvec->bv_len); +} + +static inline int _drbd_send_bio(drbd_dev *mdev, struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + struct page *page = bvec->bv_page; + size_t size = bvec->bv_len; + int offset = bvec->bv_offset; + int ret; + + ret = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0); + kunmap(page); + return ret; +} + +#endif --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./drivers/block/drbd/drbd_fs.c 2006-02-10 15:15:53.000000000 +0300 @@ -0,0 +1,1436 @@ +/* +-*- linux-c -*- + drbd_fs.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 1999-2004, Philipp Reisner . + main author. + + Copyright (C) 2002-2004, Lars Ellenberg . + main contributor. + + Copyright (C) 2000, Fábio Olivé Leite . + Some sanity checks in IOCTL_SET_STATE. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "drbd_int.h" + +#include + +ONLY_IN_26( +/* see get_sb_bdev and bd_claim */ +char *drbd_sec_holder = "Secondary DRBD cannot be bd_claimed ;)"; +char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; +) + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +STATIC enum { NotMounted=0,MountedRO,MountedRW } drbd_is_mounted(int minor) +{ + struct super_block *sb; + + sb = get_super(MKDEV(MAJOR_NR, minor)); + if(!sb) return NotMounted; + + if(sb->s_flags & MS_RDONLY) { + drop_super(sb); + return MountedRO; + } + + drop_super(sb); + return MountedRW; +} +#endif + +char* ppsize(char* buf, size_t size) +{ + // Needs 9 bytes at max. 
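+ // e.g. ppsize(buf, 4194304) gives "4096 MB": shift right by 10 until
+ // the value drops below 10000, counting units up from 'K'.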
+ static char units[] = { 'K','M','G','T' }; + int base = 0; + while (size >= 10000 ) { + size = size >> 10; + base++; + } + sprintf(buf,"%ld %cB",(long)size,units[base]); + + return buf; +} + +/* Returns -ENOMEM if we could not allocate the bitmap + * + * currently *_size is in KB. + * + * FIXME + * since this is done by drbd receiver as well as from drbdsetup, + * this actually needs proper locking! + * drbd_bm_resize already protects itself with a mutex. + * but again, this is a state change, and thus should be serialized with other + * state changes on a more general level already. + */ +int drbd_determin_dev_size(struct Drbd_Conf* mdev) +{ + sector_t pmdss; // previous meta data start sector + sector_t la_size; + sector_t size; + char ppb[10]; + + int md_moved, la_size_changed; + int rv=0; + + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + pmdss = drbd_md_ss(mdev); + la_size = mdev->la_size; + + size = drbd_new_dev_size(mdev); + + if( (drbd_get_capacity(mdev->this_bdev)>>1) != size ) { + int err; + err = drbd_bm_resize(mdev,size<<1); // wants sectors + if (unlikely(err)) { + /* currently there is only one error: ENOMEM! */ + size = drbd_bm_capacity(mdev)>>1; + if (size == 0) { + ERR("OUT OF MEMORY! Could not allocate bitmap! Set device size => 0\n"); + } else { + /* FIXME this is problematic, + * if we in fact are smaller now! */ + ERR("BM resizing failed. " + "Leaving size unchanged at size = %lu KB\n", + (unsigned long)size); + } + rv = err; + } + // racy, see comments above. + drbd_set_my_capacity(mdev,size<<1); + mdev->la_size = size; + INFO("size = %s (%lu KB)\n",ppsize(ppb,size), + (unsigned long)size); + } + if (rv < 0) goto out; + + la_size_changed = (la_size != mdev->la_size); + md_moved = pmdss != drbd_md_ss(mdev) /* && mdev->md_index == -1 */; + + if ( md_moved ) { + WARN("Moving meta-data.\n"); + D_ASSERT(mdev->md_index == -1); + } + + if ( la_size_changed || md_moved ) { + if( inc_local_md_only(mdev)) { + drbd_al_shrink(mdev); // All extents inactive. + drbd_bm_write(mdev); // write bitmap + // Write mdev->la_size to on disk. + drbd_md_write(mdev); + dec_local(mdev); + } + } + out: + lc_unlock(mdev->act_log); + + return rv; +} + +/* + * currently *_size is in KB. + */ +sector_t drbd_new_dev_size(struct Drbd_Conf* mdev) +{ + sector_t p_size = mdev->p_size; // partner's disk size. + sector_t la_size = mdev->la_size; // last agreed size. + sector_t m_size; // my size + sector_t u_size = mdev->lo_usize; // size requested by user. + sector_t size=0; + + m_size = drbd_get_capacity(mdev->backing_bdev)>>1; + + if (mdev->md_index == -1 && m_size) {// internal metadata + D_ASSERT(m_size > MD_RESERVED_SIZE); + m_size = drbd_md_ss(mdev)>>1; + } + + if(p_size && m_size) { + size=min_t(sector_t,p_size,m_size); + } else { + if(la_size) { + size=la_size; + if(m_size && m_size < size) size=m_size; + if(p_size && p_size < size) size=p_size; + } else { + if(m_size) size=m_size; + if(p_size) size=p_size; + } + } + + if(size == 0) { + ERR("Both nodes diskless!\n"); + } + + if(u_size) { + if(u_size > size) { + ERR("Requested disk size is too big (%lu > %lu)\n", + (unsigned long)u_size, (unsigned long)size); + } else { + size = u_size; + } + } + + return size; +} + +/* checks that the al lru is of requested size, and if neccessary tries to + * allocate a new one. returns -EBUSY if current al lru is still used, + * -ENOMEM when allocation failed, and 0 on success. 
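+ * the new lru is allocated first and only swapped in under al_lock if
+ * no extent of the old one is still referenced.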
+ */ +STATIC int drbd_check_al_size(drbd_dev *mdev) +{ + struct lru_cache *n,*t; + struct lc_element *e; + unsigned int in_use; + int i; + + ERR_IF(mdev->sync_conf.al_extents < 7) + mdev->sync_conf.al_extents = 127; + + if ( mdev->act_log && + mdev->act_log->nr_elements == mdev->sync_conf.al_extents ) + return 0; + + in_use = 0; + t = mdev->act_log; + n = lc_alloc(mdev->sync_conf.al_extents, + sizeof(struct lc_element), mdev); + + if (n==NULL) { + ERR("Cannot allocate act_log lru!\n"); + return -ENOMEM; + } + spin_lock_irq(&mdev->al_lock); + if (t) { + for (i=0; i < t->nr_elements; i++) { + e = lc_entry(t,i); + if (e->refcnt) + ERR("refcnt(%d)==%d\n", + e->lc_number, e->refcnt); + in_use += e->refcnt; + } + } + if (!in_use) { + mdev->act_log = n; + } + spin_unlock_irq(&mdev->al_lock); + if (in_use) { + ERR("Activity log still in use!\n"); + lc_free(n); + return -EBUSY; + } else { + if (t) lc_free(t); + } + drbd_md_write(mdev); + return 0; +} + +STATIC int drbd_detach_ioctl(drbd_dev *mdev); + +STATIC +int drbd_ioctl_set_disk(struct Drbd_Conf *mdev, + struct ioctl_disk_config * arg) +{ + NOT_IN_26(int err;) // unused in 26 ?? cannot believe it ... + int i, md_gc_valid, minor, mput=0; + enum ret_codes retcode; + struct disk_config new_conf; + struct file *filp = 0; + struct file *filp2 = 0; + struct inode *inode, *inode2; + NOT_IN_26(kdev_t bdev, bdev2;) + ONLY_IN_26(struct block_device *bdev, *bdev2;) + + minor=(int)(mdev-drbd_conf); + + /* if you want to reconfigure, please tear down first */ + smp_rmb(); + if (!test_bit(DISKLESS,&mdev->flags)) + return -EBUSY; + + /* if this was "adding" a lo dev to a previously "diskless" node, + * there still could be requests comming in right now. brrks. + * if it was mounted, we had an open_cnt > 1, + * so it would be BUSY anyways... + */ + ERR_IF (mdev->state != Secondary) + return -EBUSY; + + if (mdev->open_cnt > 1) + return -EBUSY; + + if (copy_from_user(&new_conf, &arg->config,sizeof(struct disk_config))) + return -EFAULT; + + /* FIXME + * I'd like to do it here, so I can just fail this ioctl with ENOMEM. + * but drbd_md_read below might change the al_nr_extens again, so need + * to do it there again anyways... + * but then I already changed it all and cannot easily undo it.. + * for now, do it there, but then if it fails, rather panic than later + * have a NULL pointer dereference. + * + i = drbd_check_al_size(mdev); + if (i) return i; + * + */ + + if (mdev->cstate == Unconfigured) { + // ioctl already has a refcnt + __module_get(THIS_MODULE); + mput = 1; + } else { + /* We currently cannot handle reattach while connected */ + return -EBUSY; + + /* FIXME allow reattach while connected, + * and allow it in Primary/Diskless state... + * currently there are strange races leading to a distributed + * deadlock in that case... 
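+ * (note that because of the unconditional return -EBUSY above, the
+ * StandAlone check below is currently never reached; attaching is only
+ * possible on an Unconfigured device for now)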
+ */ + if ( mdev->cstate != StandAlone /* && + mdev->cstate != Connected */) { + return -EBUSY; + } + } + + if ( new_conf.meta_index < -1) { + retcode=LDMDInvalid; + goto fail_ioctl; + } + + filp = fget(new_conf.lower_device); + if (!filp) { + retcode=LDFDInvalid; + goto fail_ioctl; + } + + inode = filp->f_dentry->d_inode; + + if (!S_ISBLK(inode->i_mode)) { + retcode=LDNoBlockDev; + goto fail_ioctl; + } + + filp2 = fget(new_conf.meta_device); + + if (!filp2) { + retcode=MDFDInvalid; + goto fail_ioctl; + } + + inode2 = filp2->f_dentry->d_inode; + + if (!S_ISBLK(inode2->i_mode)) { + retcode=MDNoBlockDev; + goto fail_ioctl; + } + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0) + bdev = inode->i_bdev; + if (bd_claim(bdev, mdev)) { + retcode=LDMounted; + goto fail_ioctl; + } + + bdev2 = inode2->i_bdev; + if (bd_claim(bdev2, new_conf.meta_index== - 1 ? + (void *)mdev : (void*) drbd_m_holder )) { + retcode=MDMounted; + goto release_bdev_fail_ioctl; + } +#else + for(i=0;ii_rdev == drbd_conf[i].backing_bdev) { + retcode=LDAlreadyInUse; + goto fail_ioctl; + } + } + + if (drbd_is_mounted(inode->i_rdev)) { + WARN("can not configure %d:%d, has active inodes!\n", + MAJOR(inode->i_rdev), MINOR(inode->i_rdev)); + retcode=LDMounted; + goto fail_ioctl; + } + + if ((err = blkdev_open(inode, filp))) { + ERR("blkdev_open( %d:%d ,) returned %d\n", + MAJOR(inode->i_rdev), MINOR(inode->i_rdev), err); + retcode=LDOpenFailed; + goto fail_ioctl; + } + bdev = inode->i_rdev; + + if ((err = blkdev_open(inode2, filp2))) { + ERR("blkdev_open( %d:%d ,) returned %d\n", + MAJOR(inode->i_rdev), MINOR(inode->i_rdev), err); + retcode=MDOpenFailed; + goto release_bdev_fail_ioctl; + } + bdev2 = inode2->i_rdev; +#endif + + if ( (bdev == bdev2) != (new_conf.meta_index == -1) ) { + retcode=LDMDInvalid; + goto release_bdev2_fail_ioctl; + } + + if ((drbd_get_capacity(bdev)>>1) < new_conf.disk_size) { + retcode = LDDeviceTooSmall; + goto release_bdev2_fail_ioctl; + } + + if (drbd_get_capacity(bdev) > DRBD_MAX_SECTORS) { + retcode = LDDeviceTooLarge; + goto release_bdev2_fail_ioctl; + } + + if ( new_conf.meta_index == -1 ) i = 1; + else i = new_conf.meta_index+1; + + /* for internal, we need to check agains <= (then we have a drbd with + * zero size, but meta data...) to be on the safe side, I require 32MB + * minimal data storage area for drbd with internal meta data (thats + * 160 total). if someone wants to use that small devices, she can use + * drbd 0.6 anyways... + * + * FIXME this is arbitrary and needs to be reconsidered as soon as we + * move to flexible size meta data. + */ + if( drbd_get_capacity(bdev2) < 2*MD_RESERVED_SIZE*i + + (new_conf.meta_index == -1) ? (1<<16) : 0 ) + { + retcode = MDDeviceTooSmall; + goto release_bdev2_fail_ioctl; + } + + drbd_free_ll_dev(mdev); + + mdev->md_bdev = bdev2; + mdev->md_file = filp2; + mdev->md_index = new_conf.meta_index; + + mdev->backing_bdev = bdev; + mdev->lo_file = filp; + mdev->lo_usize = new_conf.disk_size; + mdev->on_io_error = new_conf.on_io_error; + + mdev->send_cnt = 0; + mdev->recv_cnt = 0; + mdev->read_cnt = 0; + mdev->writ_cnt = 0; + +// FIXME unclutter the code again ;) +/* + * Returns the minimum that is _not_ zero, unless both are zero. + */ +#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? 
l : min(l, r)) +ONLY_IN_26({ + request_queue_t * const q = mdev->rq_queue; + request_queue_t * const b = bdev->bd_disk->queue; + + q->max_sectors = min_not_zero((unsigned short)(PAGE_SIZE >> 9), b->max_sectors); + q->max_phys_segments = 1; + q->max_hw_segments = 1; + q->max_segment_size = min((unsigned)PAGE_SIZE,b->max_segment_size); + q->hardsect_size = max((unsigned short)512,b->hardsect_size); + q->seg_boundary_mask = PAGE_SIZE-1; + D_ASSERT(q->hardsect_size <= PAGE_SIZE); // or we are really screwed ;-) +}) +#undef min_not_zero + + clear_bit(SENT_DISK_FAILURE,&mdev->flags); + set_bit(MD_IO_ALLOWED,&mdev->flags); + +/* FIXME I think inc_local_md_only within drbd_md_read is misplaced. + * should go here, and the corresponding dec_local, too. + */ + + md_gc_valid = drbd_md_read(mdev); + +/* FIXME if (md_gc_valid < 0) META DATA IO NOT POSSIBLE! */ + + /* If I am currently not Primary, + * but meta data primary indicator is set, + * I just now recover from a hard crash, + * and have been Primary before that crash. + * + * Now, if I had no connection before that crash + * (have been degraded Primary), chances are that + * I won't find my peer now either. + * + * In that case, and _only_ in that case, + * we use the degr-wfc-timeout instead of the default, + * so we can automatically recover from a crash of a + * degraded but active "cluster" after a certain timeout. + */ + clear_bit(USE_DEGR_WFC_T,&mdev->flags); + if ( mdev->state != Primary && + drbd_md_test_flag(mdev,MDF_PrimaryInd) && + !drbd_md_test_flag(mdev,MDF_ConnectedInd) ) { + set_bit(USE_DEGR_WFC_T,&mdev->flags); + } + + drbd_bm_lock(mdev); // racy... + + if(drbd_md_test_flag(mdev,MDF_Consistent) && + drbd_new_dev_size(mdev) < mdev->la_size ) { + D_ASSERT(mdev->cstate == Unconfigured); + D_ASSERT(mput == 1); + /* Do not attach a too small disk.*/ + drbd_bm_unlock(mdev); + ERR("Lower device smaller than last agreed size!\n"); + drbd_free_ll_dev(mdev); + set_cstate(mdev,Unconfigured); + retcode = LDDeviceTooSmall; + module_put(THIS_MODULE); + if (put_user(retcode, &arg->ret_code)) return -EFAULT; + return -EINVAL; + } + if (drbd_determin_dev_size(mdev) < 0) { + /* could not allocate bitmap. + * try to undo ... */ + D_ASSERT(mdev->cstate == Unconfigured); + D_ASSERT(mput == 1); + + drbd_bm_unlock(mdev); + + /* from drbd_detach_ioctl */ + drbd_free_ll_dev(mdev); + + set_cstate(mdev,Unconfigured); + drbd_mdev_cleanup(mdev); + module_put(THIS_MODULE); + return -ENOMEM; + } + + if (md_gc_valid <= 0) { + INFO("Assuming that all blocks are out of sync (aka FullSync)\n"); + drbd_bm_set_all(mdev); + drbd_bm_write(mdev); + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_write(mdev); + } else { // md_gc_valid > 0 + /* FIXME this still does not propagate io errors! */ + drbd_bm_read(mdev); + } + + i = drbd_check_al_size(mdev); + if (i) { + /* FIXME see the comment above. + * if this fails I need to undo all changes, + * go back into Unconfigured, + * and fail the ioctl with ENOMEM... + */ + // return i; + drbd_panic("Cannot allocate act_log\n"); + } + + if (md_gc_valid > 0) { + drbd_al_read_log(mdev); + if (drbd_md_test_flag(mdev,MDF_PrimaryInd)) { + drbd_al_apply_to_bm(mdev); + drbd_al_to_on_disk_bm(mdev); + } + } /* else { + FIXME wipe out on disk al! 
+ } */ + + drbd_set_blocksize(mdev,INITIAL_BLOCK_SIZE); + + if(mdev->cstate == Unconfigured ) { + drbd_thread_start(&mdev->worker); + set_cstate(mdev,StandAlone); + } + + + clear_bit(DISKLESS,&mdev->flags); + smp_wmb(); +// FIXME EXPLAIN: + clear_bit(MD_IO_ALLOWED,&mdev->flags); + + /* FIXME currently only StandAlone here... + * Connected is not possible, since + * above we return -EBUSY in that case */ + D_ASSERT(mdev->cstate <= Connected); + if(mdev->cstate == Connected ) { + drbd_send_param(mdev,1); + } + drbd_bm_unlock(mdev); + + return 0; + + release_bdev2_fail_ioctl: + NOT_IN_26(blkdev_put(filp2->f_dentry->d_inode->i_bdev,BDEV_FILE);) + ONLY_IN_26(bd_release(bdev2);) + release_bdev_fail_ioctl: + NOT_IN_26(blkdev_put(filp->f_dentry->d_inode->i_bdev,BDEV_FILE);) + ONLY_IN_26(bd_release(bdev);) + fail_ioctl: + if (mput) module_put(THIS_MODULE); + if (filp) fput(filp); + if (filp2) fput(filp2); + if (put_user(retcode, &arg->ret_code)) return -EFAULT; + return -EINVAL; +} + +STATIC +int drbd_ioctl_get_conf(struct Drbd_Conf *mdev, struct ioctl_get_config* arg) +{ + struct ioctl_get_config cn; + memset(&cn,0,sizeof(cn)); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + if (mdev->backing_bdev) { + cn.lower_device_major = MAJOR(mdev->backing_bdev->bd_dev); + cn.lower_device_minor = MINOR(mdev->backing_bdev->bd_dev); + bdevname(mdev->backing_bdev,cn.lower_device_name); + } + if (mdev->md_bdev) { + cn.meta_device_major = MAJOR(mdev->md_bdev->bd_dev); + cn.meta_device_minor = MINOR(mdev->md_bdev->bd_dev); + bdevname(mdev->md_bdev,cn.meta_device_name); + } +#else + cn.lower_device_major=MAJOR(mdev->backing_bdev); + cn.lower_device_minor=MINOR(mdev->backing_bdev); + cn.meta_device_major=MAJOR(mdev->md_bdev); + cn.meta_device_minor=MINOR(mdev->md_bdev); + if (mdev->backing_bdev) { + strncpy(cn.lower_device_name, + bdevname(mdev->backing_bdev), BDEVNAME_SIZE); + } + if (mdev->md_bdev) { + strncpy(cn.meta_device_name, + bdevname(mdev->md_bdev), BDEVNAME_SIZE); + } +#endif + cn.cstate=mdev->cstate; + cn.state=mdev->state; + cn.peer_state=mdev->o_state; + cn.disk_size_user=mdev->lo_usize; + cn.meta_index=mdev->md_index; + cn.on_io_error=mdev->on_io_error; + memcpy(&cn.nconf, &mdev->conf, sizeof(struct net_config)); + memcpy(&cn.sconf, &mdev->sync_conf, sizeof(struct syncer_config)); + + if (copy_to_user(arg,&cn,sizeof(struct ioctl_get_config))) + return -EFAULT; + + return 0; +} + + +STATIC +int drbd_ioctl_set_net(struct Drbd_Conf *mdev, struct ioctl_net_config * arg) +{ + int i,minor, mput=0; + enum ret_codes retcode; + struct net_config new_conf; + + minor=(int)(mdev-drbd_conf); + + // FIXME plausibility check + if (copy_from_user(&new_conf, &arg->config,sizeof(struct net_config))) + return -EFAULT; + + if (mdev->cstate == Unconfigured) { + // ioctl already has a refcnt + __module_get(THIS_MODULE); + mput = 1; + } + +#define M_ADDR(A) (((struct sockaddr_in *)&A.my_addr)->sin_addr.s_addr) +#define M_PORT(A) (((struct sockaddr_in *)&A.my_addr)->sin_port) +#define O_ADDR(A) (((struct sockaddr_in *)&A.other_addr)->sin_addr.s_addr) +#define O_PORT(A) (((struct sockaddr_in *)&A.other_addr)->sin_port) + for(i=0;ireceiver); + drbd_free_sock(mdev); + + // TODO plausibility check ... + memcpy(&mdev->conf,&new_conf,sizeof(struct net_config)); + +#if 0 +FIXME + /* for the connection loss logic in drbd_recv + * I _need_ the resulting timeo in jiffies to be + * non-zero and different + * + * XXX maybe rather store the value scaled to jiffies? 
+ * Note: MAX_SCHEDULE_TIMEOUT/HZ*HZ != MAX_SCHEDULE_TIMEOUT + * and HZ > 10; which is unlikely to change... + * Thus, if interrupted by a signal, + * sock_{send,recv}msg returns -EINTR, + * if the timeout expires, -EAGAIN. + */ + // unlikely: someone disabled the timeouts ... + // just put some huge values in there. + if (!mdev->conf.ping_int) + mdev->conf.ping_int = MAX_SCHEDULE_TIMEOUT/HZ; + if (!mdev->conf.timeout) + mdev->conf.timeout = MAX_SCHEDULE_TIMEOUT/HZ*10; + if (mdev->conf.ping_int*10 < mdev->conf.timeout) + mdev->conf.timeout = mdev->conf.ping_int*10/6; + if (mdev->conf.ping_int*10 == mdev->conf.timeout) + mdev->conf.ping_int = mdev->conf.ping_int+1; +#endif + + mdev->send_cnt = 0; + mdev->recv_cnt = 0; + + drbd_thread_start(&mdev->worker); + set_cstate(mdev,Unconnected); + drbd_thread_start(&mdev->receiver); + + return 0; + + fail_ioctl: + if (mput) module_put(THIS_MODULE); + if (put_user(retcode, &arg->ret_code)) return -EFAULT; + return -EINVAL; +} + +int drbd_set_state(drbd_dev *mdev,Drbd_State newstate) +{ + int forced = 0; + int dont_have_good_data; + NOT_IN_26(int minor = mdev-drbd_conf;) + + D_ASSERT(semaphore_is_locked(&mdev->device_mutex)); + + if ( (newstate & 0x3) == mdev->state ) return 0; /* nothing to do */ + + // exactly one of sec or pri. not both. + if ( !((newstate ^ (newstate >> 1)) & 1) ) return -EINVAL; + + if(mdev->cstate == Unconfigured) + return -ENXIO; + + if ( (newstate & Primary) && (mdev->o_state == Primary) ) + return -EACCES; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + smp_rmb(); + if ( (newstate & Secondary) && + (test_bit(WRITER_PRESENT,&mdev->flags) || + drbd_is_mounted(minor) == MountedRW)) + return -EBUSY; +#else + ERR_IF (mdev->this_bdev->bd_contains == 0) { + // FIXME this masks a bug somewhere else! + mdev->this_bdev->bd_contains = mdev->this_bdev; + } + + if ( newstate & Secondary ) { + /* If I got here, I am Primary. I claim me for myself. If that + * does not succeed, someone other has claimed me, so I cannot + * become Secondary. */ + if (bd_claim(mdev->this_bdev,drbd_sec_holder)) + return -EBUSY; + if (disable_bd_claim) + bd_release(mdev->this_bdev); + } +#endif + + + /* I dont have access to good data anywhere, if: + * ( I am diskless OR inconsistent ) + * AND + * ( not connected, or partner has no consistent data either ) + */ + dont_have_good_data = + ( test_bit(DISKLESS, &mdev->flags) + || !drbd_md_test_flag(mdev,MDF_Consistent) ) + && + ( mdev->cstate < Connected + || test_bit(PARTNER_DISKLESS, &mdev->flags) + || !test_bit(PARTNER_CONSISTENT, &mdev->flags) ); + + if (newstate & Primary) { + if ( test_bit(DISKLESS,&mdev->flags) + && mdev->cstate < Connected ) { + /* not even brute force can find data without disk. + * FIXME choose a usefull Error, + * and update drbsetup accordingly */ + return -EIO; + } else if (dont_have_good_data) { + /* ok, either we have a disk (which may be inconsistent) + * or we have a connection */ + if (newstate & DontBlameDrbd) { + forced = 1; + /* make sure the Human count is increased if + * we got here only because it was forced. + * maybe we want to force a FullSync? */ + newstate |= Human; + } else { + return -EIO; + } + } else if (mdev->cstate >= Connected) { + /* do NOT increase the Human count if we are connected, + * and there is no reason for it. 
See + * drbd_lk9.pdf middle of Page 7 + */ + newstate &= ~(Human|DontBlameDrbd); + } + } + + drbd_sync_me(mdev); + + /* Wait until nothing is on the fly :) */ + if ( wait_event_interruptible( mdev->cstate_wait, + atomic_read(&mdev->ap_pending_cnt) == 0 ) ) { +ONLY_IN_26( + if ( newstate & Secondary ) { + D_ASSERT(mdev->this_bdev->bd_holder == drbd_sec_holder); + bd_release(mdev->this_bdev); + } +) + return -EINTR; + } + + /* FIXME RACE here: if our direct user is not using bd_claim (i.e. + * not a filesystem) since cstate might still be >= Connected, new + * ap requests may come in and increase ap_pending_cnt again! + * but that means someone is misusing DRBD... + * */ + + if (forced) { /* this was --do-what-I-say ... */ + int i; + // drbd_dump_md(mdev,0,0); + for (i=HumanCnt; i < GEN_CNT_SIZE ; i++) { + if (mdev->gen_cnt[i] != 1) { + WARN("Forcefully set consistent! " + "If this screws your data, don't blame DRBD!\n"); + break; + } + } + drbd_md_set_flag(mdev,MDF_Consistent); + } + set_bit(MD_DIRTY,&mdev->flags); // we are changing state! + INFO( "%s/%s --> %s/%s\n", + nodestate_to_name(mdev->state), + nodestate_to_name(mdev->o_state), + nodestate_to_name(newstate & 0x03), + nodestate_to_name(mdev->o_state) ); + mdev->state = (Drbd_State) newstate & 0x03; + if(newstate & Primary) { + NOT_IN_26( set_device_ro(MKDEV(MAJOR_NR, minor), FALSE ); ) + +ONLY_IN_26( + set_disk_ro(mdev->vdisk, FALSE ); + D_ASSERT(mdev->this_bdev->bd_holder == drbd_sec_holder); + bd_release(mdev->this_bdev); + mdev->this_bdev->bd_disk = mdev->vdisk; +) + + if(test_bit(ON_PRI_INC_HUMAN,&mdev->flags)) { + newstate |= Human; + clear_bit(ON_PRI_INC_HUMAN,&mdev->flags); + } + + if(test_bit(ON_PRI_INC_TIMEOUTEX,&mdev->flags)) { + newstate |= TimeoutExpired; + clear_bit(ON_PRI_INC_TIMEOUTEX,&mdev->flags); + } + + if(newstate & Human) { + drbd_md_inc(mdev,HumanCnt); + } else if(newstate & TimeoutExpired ) { + drbd_md_inc(mdev,TimeoutCnt); + } else { + drbd_md_inc(mdev, + mdev->cstate >= Connected ? + ConnectedCnt : ArbitraryCnt); + } + } else { + NOT_IN_26( set_device_ro(MKDEV(MAJOR_NR, minor), TRUE ); ) + ONLY_IN_26( set_disk_ro(mdev->vdisk, TRUE ); ) + } + + if(!test_bit(DISKLESS,&mdev->flags) && (newstate & Secondary)) { + drbd_al_to_on_disk_bm(mdev); + } + /* Primary indicator has changed in any case. 
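+ * (that is what MDF_PrimaryInd tracks), so the meta data gets written
+ * out unconditionally right here.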
*/ + drbd_md_write(mdev); + + if (mdev->cstate >= WFReportParams) { + /* if this was forced, we should consider sync */ + drbd_send_param(mdev,forced); + } + + return 0; +} + +static int drbd_get_wait_time(long *tp, struct Drbd_Conf *mdev, + struct ioctl_wait *arg) +{ + long time; + struct ioctl_wait p; + + if(copy_from_user(&p,arg,sizeof(p))) { + return -EFAULT; + } + + if ( test_bit(USE_DEGR_WFC_T,&mdev->flags) ) { + time=p.degr_wfc_timeout; + if (time) WARN("using degr_wfc_timeout=%ld seconds\n", time); + } else { + time=p.wfc_timeout; + } + + time=time*HZ; + if(time==0) time=MAX_SCHEDULE_TIMEOUT; + + *tp=time; + + return 0; +} + +STATIC int drbd_ioctl_set_syncer(struct Drbd_Conf *mdev, + struct ioctl_syncer_config* arg) +{ + struct syncer_config sc; + int err; + + if(copy_from_user(&sc,&arg->config,sizeof(sc))) return -EFAULT; + + sc.use_csums = 0; // TODO, NYI + ERR_IF (sc.rate < 1) sc.rate = 1; + ERR_IF (sc.skip & ~1) sc.skip = !!sc.skip; + ERR_IF (sc.al_extents < 7) sc.al_extents = 127; // arbitrary minimum +#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT) + if(sc.al_extents > AL_MAX) { + ERR("sc.al_extents > %d\n",AL_MAX); + sc.al_extents = AL_MAX; + } +#undef AL_MAX + + mdev->sync_conf.rate = sc.rate; + mdev->sync_conf.use_csums = sc.use_csums; + mdev->sync_conf.skip = sc.skip; + mdev->sync_conf.al_extents = sc.al_extents; + + err = drbd_check_al_size(mdev); + if (err) return err; + + if (mdev->cstate > WFConnection) + drbd_send_sync_param(mdev,&sc); + + drbd_alter_sg(mdev, sc.group); + + return 0; +} + +STATIC int drbd_detach_ioctl(drbd_dev *mdev) +{ + int would_discard_last_good_data; + int interrupted; + + // not during resync. no. + if (mdev->cstate > Connected) return -EBUSY; + + /* this was the last good data copy, if: + * (I am Primary, and not connected ), + * OR + * (we are connected, and Peer has no good data himself) + */ + would_discard_last_good_data = + ( mdev->state == Primary && mdev->cstate < Connected ) + || + ( mdev->cstate >= Connected + && ( test_bit(PARTNER_DISKLESS, &mdev->flags) + || !test_bit(PARTNER_CONSISTENT, &mdev->flags) ) ); + + if ( would_discard_last_good_data ) { + return -ENETRESET; + } + if (test_bit(DISKLESS,&mdev->flags) || + test_bit(PARTNER_DISKLESS,&mdev->flags) ) { + return -ENXIO; + } + + drbd_sync_me(mdev); + + set_bit(DISKLESS,&mdev->flags); + smp_wmb(); + + interrupted = wait_event_interruptible(mdev->cstate_wait, + atomic_read(&mdev->local_cnt)==0); + if ( interrupted ) { + clear_bit(DISKLESS,&mdev->flags); + return -EINTR; + } + + drbd_free_ll_dev(mdev); + +/* FIXME race with sync start +*/ + if (mdev->cstate == Connected) drbd_send_param(mdev,0); +/* FIXME +* if you detach while connected, you are *at least* inconsistent now, +* and should clear MDF_Consistent in metadata, and maybe even set the bitmap +* out of sync. +* since if you reattach, this might be a different lo dev, and then it needs +* to receive a sync! +*/ + if (mdev->cstate == StandAlone) { + // maybe < Connected is better? 
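+ // a StandAlone device without disk has nothing left to do, so drop
+ // back to Unconfigured, clean up and release our module reference.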
+ set_cstate(mdev,Unconfigured); + drbd_mdev_cleanup(mdev); + module_put(THIS_MODULE); + } + return 0; +} + +#ifdef CONFIG_COMPAT +long drbd_compat_ioctl(struct file *f, unsigned cmd, unsigned long arg) +{ + int ret; + // lock_kernel(); Not needed, since we have mdev->device_mutex + ret = drbd_ioctl(f->f_dentry->d_inode, f, cmd, arg); + // unlock_kernel(); + return ret; +} +#endif + +int drbd_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + int minor,err=0; + long time; + struct Drbd_Conf *mdev; + struct ioctl_wait* wp; +ONLY_IN_26( + struct block_device *bdev = inode->i_bdev; + struct gendisk *disk = bdev->bd_disk; +) + + minor = MINOR(inode->i_rdev); + if (minor >= minor_count) return -ENODEV; + mdev = drbd_conf + minor; + + D_ASSERT(MAJOR(inode->i_rdev) == MAJOR_NR); + + /* + * check whether we can permit this ioctl, and whether is makes sense. + * we don't care for the BLK* ioctls, with 2.6 they never end up here. + * + * for non-sysadmins, we only allow GET_CONFIG (and GET_VERSION) + * all other things need CAP_SYS_ADMIN. + * + * on an Unconfigured device, only configure requests make sense. + * still we silently ignore requests to become secondary or to + * unconfigure. other requests are invalid. + * + * I chose to have an additional switch statement for it + * because I think this makes it more obvious. + * + * because we look at mdev->cstate, it should be inside the lock + * (once we serialize cstate changes, it has to be...) + * + */ + if (!capable(CAP_SYS_ADMIN) + && cmd != DRBD_IOCTL_GET_CONFIG + && cmd != DRBD_IOCTL_GET_VERSION) { + err = -EPERM; + goto out_unlocked; + } + + if (mdev->cstate == Unconfigured) { + switch (cmd) { + default: + /* oops, unknown IOCTL ?? */ + err = -EINVAL; + goto out_unlocked; + + case DRBD_IOCTL_GET_CONFIG: + case DRBD_IOCTL_GET_VERSION: + break; /* always allowed */ + + case DRBD_IOCTL_SET_DISK_CONFIG: + case DRBD_IOCTL_SET_NET_CONFIG: + break; /* no restriction here */ + + case DRBD_IOCTL_UNCONFIG_DISK: + case DRBD_IOCTL_UNCONFIG_NET: + /* no op, so "drbdadm down all" does not fail */ + err = 0; + goto out_unlocked; + + /* the rest of them don't make sense if Unconfigured. + * still, set an Unconfigured device Secondary + * is allowed, so "drbdadm down all" does not fail */ + case DRBD_IOCTL_SET_STATE: + case DRBD_IOCTL_INVALIDATE: + case DRBD_IOCTL_INVALIDATE_REM: + case DRBD_IOCTL_SET_DISK_SIZE: + case DRBD_IOCTL_SET_STATE_FLAGS: + case DRBD_IOCTL_SET_SYNC_CONFIG: + case DRBD_IOCTL_WAIT_CONNECT: + case DRBD_IOCTL_WAIT_SYNC: + err = (cmd == DRBD_IOCTL_SET_STATE && arg == Secondary) + ? 0 : -ENXIO; + goto out_unlocked; + } + } + + if (unlikely(drbd_did_panic == DRBD_MAGIC)) + return -EBUSY; + + if( (err=down_interruptible(&mdev->device_mutex)) ) return err; + /* + * please no 'return', use 'err = -ERRNO; goto out;' + * we hold the device_mutex + */ + +ONLY_IN_26( + D_ASSERT(bdev == mdev->this_bdev); + D_ASSERT(disk == mdev->vdisk); +); + + smp_rmb(); + switch (cmd) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +/* see how sys_ioctl and blkdev_ioctl handle it in 2.6 . + * If I understand correctly, only "private" ioctl end up here. + */ + case BLKGETSIZE: + err = put_user(drbd_get_capacity(mdev->this_bdev),(long *)arg); + break; + +#ifdef BLKGETSIZE64 + case BLKGETSIZE64: /* see ./drivers/block/loop.c */ + err = put_user((u64)drbd_get_capacity(mdev->this_bdev)<<9, + (u64*)arg); + break; +#endif + + case BLKROSET: // THINK do we want to intercept this one ? 
+ case BLKROGET: + case BLKFLSBUF: + case BLKSSZGET: + case BLKBSZGET: + case BLKBSZSET: // THINK do we want to intercept this one ? + case BLKPG: + err=blk_ioctl(inode->i_rdev, cmd, arg); + break; +#endif + case DRBD_IOCTL_GET_VERSION: + err = put_user(API_VERSION, (int *) arg); + break; + + case DRBD_IOCTL_SET_STATE: + if (arg & ~(Primary|Secondary|Human|TimeoutExpired| + DontBlameDrbd) ) { + err = -EINVAL; + } else { + err = drbd_set_state(mdev,arg); + } + break; + + case DRBD_IOCTL_SET_STATE_FLAGS: + if (arg & ~(Human|TimeoutExpired) ) { + err = -EINVAL; + } else { + clear_bit(ON_PRI_INC_HUMAN,&mdev->flags); + clear_bit(ON_PRI_INC_TIMEOUTEX,&mdev->flags); + if (arg == 0) break; + + // XXX reduce race: don't set it, + // if we have a connection. + // this does not avoid the race completely, though. + if (mdev->cstate > WFConnection) { + WARN("race avoidance: did not set " + "the state flags (%s), cstate=%s\n", + arg == (Human|TimeoutExpired) + ? "Human|TimeoutExpired" + : arg == Human + ? "Human" + : "TimeoutExpired", + cstate_to_name(mdev->cstate)); + break; + } + + if (arg & Human) + set_bit(ON_PRI_INC_HUMAN,&mdev->flags); + if (arg & TimeoutExpired) + set_bit(ON_PRI_INC_TIMEOUTEX,&mdev->flags); + } + break; + + case DRBD_IOCTL_SET_DISK_CONFIG: + err = drbd_ioctl_set_disk(mdev,(struct ioctl_disk_config*)arg); + break; + + case DRBD_IOCTL_SET_DISK_SIZE: + if (mdev->cstate > Connected) { + err = -EBUSY; + break; + } + if ( mdev->state == Secondary && mdev->o_state == Secondary) { + err = -EINPROGRESS; + break; + } + err=0; + mdev->lo_usize = (unsigned long)arg; + drbd_bm_lock(mdev); + drbd_determin_dev_size(mdev); + drbd_md_write(mdev); // Write mdev->la_size to disk. + drbd_bm_unlock(mdev); + if (mdev->cstate == Connected) drbd_send_param(mdev,1); + break; + + case DRBD_IOCTL_SET_NET_CONFIG: + err = drbd_ioctl_set_net(mdev,(struct ioctl_net_config*) arg); + break; + + case DRBD_IOCTL_SET_SYNC_CONFIG: + err = drbd_ioctl_set_syncer(mdev, + (struct ioctl_syncer_config*) arg); + break; + + case DRBD_IOCTL_GET_CONFIG: + err = drbd_ioctl_get_conf(mdev,(struct ioctl_get_config*) arg); + break; + + case DRBD_IOCTL_UNCONFIG_NET: + if ( mdev->cstate == Unconfigured) break; + if ( ( mdev->state == Primary + && test_bit(DISKLESS,&mdev->flags) ) + || ( mdev->o_state == Primary + && !test_bit(PARTNER_CONSISTENT,&mdev->flags) ) ) + { + err=-ENODATA; + break; + } + /* FIXME what if fsync returns error */ + drbd_sync_me(mdev); + set_bit(DO_NOT_INC_CONCNT,&mdev->flags); + set_cstate(mdev,Unconnected); + drbd_thread_stop(&mdev->receiver); + + if (test_bit(DISKLESS,&mdev->flags)) { + set_cstate(mdev,Unconfigured); + drbd_mdev_cleanup(mdev); + module_put(THIS_MODULE); + } else set_cstate(mdev,StandAlone); + + break; + + case DRBD_IOCTL_UNCONFIG_DISK: + if (mdev->cstate == Unconfigured) break; + err = drbd_detach_ioctl(mdev); + break; + + case DRBD_IOCTL_WAIT_CONNECT: + wp=(struct ioctl_wait*)arg; + if( (err=drbd_get_wait_time(&time,mdev,wp)) ) break; + + // We can drop the mutex, we do not touch anything in mdev. 
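+ // the wait below only reads cstate; on timeout we return -ETIME,
+ // otherwise ret_code tells userspace whether we ended up Connected.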
+ up(&mdev->device_mutex); + + time = wait_event_interruptible_timeout( + mdev->cstate_wait, + mdev->cstate < Unconnected + || mdev->cstate >= Connected, + time ); + if (time < 0) { + err = time; + goto out_unlocked; + } + if (time == 0) { + err = -ETIME; + goto out_unlocked; + } + err=0; // no error + + if(put_user(mdev->cstate>=Connected,&wp->ret_code))err=-EFAULT; + goto out_unlocked; + + case DRBD_IOCTL_WAIT_SYNC: + wp=(struct ioctl_wait*)arg; + if( (err=drbd_get_wait_time(&time,mdev,wp)) ) break; + + up(&mdev->device_mutex); + + do { + time = wait_event_interruptible_timeout( + mdev->cstate_wait, + mdev->cstate == Connected + || mdev->cstate < Unconnected, + time ); + + if (time < 0 ) { + err = time; + goto out_unlocked; + } + + if (mdev->cstate > Connected) { + time=MAX_SCHEDULE_TIMEOUT; + } + + if (time == 0) { + err = -ETIME; + goto out_unlocked; + } + } while ( mdev->cstate != Connected + && mdev->cstate >= Unconnected ); + + err=0; // no error + + if(put_user(mdev->cstate==Connected,&wp->ret_code))err=-EFAULT; + goto out_unlocked; + + case DRBD_IOCTL_INVALIDATE: + /* TODO + * differentiate between different error cases, + * or report the current connection state and flags back + * to userspace */ + + /* disallow "invalidation" of local replica + * when currently in primary state (would be a Bad Idea), + * or during a running sync (won't make any sense) */ + if( (mdev->state == Primary || + (mdev->cstate != Connected && + mdev->cstate != StandAlone)) || + test_bit(DISKLESS,&mdev->flags) || + test_bit(PARTNER_DISKLESS,&mdev->flags) ) { + err = -EINPROGRESS; + break; + } + + drbd_md_set_flag(mdev,MDF_FullSync); + drbd_md_clear_flag(mdev,MDF_Consistent); + drbd_md_write(mdev); + + if (mdev->cstate == Connected) { + /* avoid races with set_in_sync + * for successfull mirrored writes + */ + set_cstate(mdev,WFBitMapT); + wait_event(mdev->cstate_wait, + atomic_read(&mdev->ap_bio_cnt)==0); + } + + drbd_bm_lock(mdev); // racy... + + drbd_bm_set_all(mdev); + drbd_bm_write(mdev); + + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_write(mdev); + + if (mdev->cstate >= Connected) { + drbd_send_short_cmd(mdev,BecomeSyncSource); + drbd_start_resync(mdev,SyncTarget); + } + + drbd_bm_unlock(mdev); + + break; + + case DRBD_IOCTL_INVALIDATE_REM: + if( mdev->o_state == Primary || + mdev->cstate != Connected || + test_bit(DISKLESS,&mdev->flags) || + test_bit(PARTNER_DISKLESS,&mdev->flags) ) { + err = -EINPROGRESS; + break; + } + if ( !drbd_md_test_flag(mdev,MDF_Consistent) ) { + // FIXME use a more descriptive error number + err = -EINVAL; + break; + } + + drbd_md_set_flag(mdev,MDF_FullSync); + drbd_md_write(mdev); + + /* avoid races with set_in_sync + * for successfull mirrored writes + */ + set_cstate(mdev,WFBitMapS); + wait_event(mdev->cstate_wait, + atomic_read(&mdev->ap_bio_cnt)==0); + + drbd_bm_lock(mdev); // racy... + + drbd_bm_set_all(mdev); + drbd_bm_write(mdev); + + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_write(mdev); + + drbd_send_short_cmd(mdev,BecomeSyncTarget); + drbd_start_resync(mdev,SyncSource); + + drbd_bm_unlock(mdev); + + break; + + default: + err = -EINVAL; + } + /* out: */ + up(&mdev->device_mutex); + out_unlocked: + return err; +} --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./drivers/block/drbd/drbd_int.h 2006-02-09 15:39:21.000000000 +0300 @@ -0,0 +1,1564 @@ +/* + drbd_int.h + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 1999-2004, Philipp Reisner . + main author. 
+ + Copyright (C) 2002-2004, Lars Ellenberg . + main contributor. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ +#include +#include +#include +#include +#include +#include +#include + +#include "lru_cache.h" + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,8) +# define HAVE_KERNEL_SENDMSG 1 +#else +# define HAVE_KERNEL_SENDMSG 0 +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +#include "mempool.h" +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20) +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} +#endif + +// module parameter, defined in drbd_main.c +extern int minor_count; +extern int disable_bd_claim; +extern int major_nr; +extern int use_nbd_major; + +// use_nbd_major ? "nbd" : "drbd"; +extern char* drbd_devfs_name; + +#include +#ifdef DRBD_MAJOR +# warning "FIXME. DRBD_MAJOR is now officially defined in major.h" +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +/*lge: this hack is to get rid of the compiler warnings about + * 'do_nbd_request declared static but never defined' + * whilst forcing blk.h defines on + * though we probably do not need them, we do not use them... + * would not work without LOCAL_END_REQUEST + */ +# define MAJOR_NR DRBD_MAJOR +# define DEVICE_ON(device) +# define DEVICE_OFF(device) +# define DEVICE_NR(device) (MINOR(device)) +# define LOCAL_END_REQUEST +# include +# define DRBD_MAJOR major_nr +#else +# include +# include +# define MAJOR_NR major_nr +#endif + +#undef DEVICE_NAME +#define DEVICE_NAME "drbd" + +// XXX do we need this? +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +#define INITIAL_BLOCK_SIZE (1<<12) // 4K + +/* I don't remember why XCPU ... + * This is used to wake the asender, + * and to interrupt sending the sending task + * on disconnect. + */ +#define DRBD_SIG SIGXCPU + +/* This is used to stop/restart our threads. + * Cannot use SIGTERM nor SIGKILL, since these + * are sent out by init on runlevel changes + * I choose SIGHUP for now. + * + * FIXME btw, we should register some reboot notifier. 
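The FIXME above can be satisfied with the stock notifier interface from linux/reboot.h; a minimal sketch of what that registration would look like (this is not part of the patch, and the callback body is only a stub):

#include <linux/notifier.h>
#include <linux/reboot.h>

static int drbd_reboot_event(struct notifier_block *nb,
                             unsigned long event, void *unused)
{
    /* a real handler would park the threads and flush meta data here */
    return NOTIFY_DONE;
}

static struct notifier_block drbd_reboot_notifier = {
    .notifier_call = drbd_reboot_event,
};

/* module init:  register_reboot_notifier(&drbd_reboot_notifier);
 * module exit:  unregister_reboot_notifier(&drbd_reboot_notifier); */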
+ */ +#define DRBD_SIGKILL SIGHUP + +#define ID_SYNCER (-1LL) +#define ID_VACANT 0 // All EEs on the free list should have this value + // freshly allocated EEs get !ID_VACANT (== 1) + // so if it says "cannot dereference null + // pointer at adress 0x00000001, it is most + // probably one of these :( + +struct Drbd_Conf; +typedef struct Drbd_Conf drbd_dev; + +#ifdef DBG_ALL_SYMBOLS +# define STATIC +#else +# define STATIC static +#endif + +#ifdef PARANOIA +# define PARANOIA_BUG_ON(x) BUG_ON(x) +#else +# define PARANOIA_BUG_ON(x) +#endif + +/* + * Some Message Macros + *************************/ + +// handy macro: DUMPP(somepointer) +#define DUMPP(A) ERR( #A " = %p in %s:%d\n", (A),__FILE__,__LINE__); +#define DUMPLU(A) ERR( #A " = %lu in %s:%d\n", (A),__FILE__,__LINE__); +#define DUMPLLU(A) ERR( #A " = %llu in %s:%d\n",(A),__FILE__,__LINE__); +#define DUMPLX(A) ERR( #A " = %lx in %s:%d\n", (A),__FILE__,__LINE__); +#define DUMPI(A) ERR( #A " = %d in %s:%d\n", (A),__FILE__,__LINE__); + +#define DUMPST(A) DUMPLLU((unsigned long long)(A)) + + +// Info: do not remove the spaces around the "," before ## +// Otherwise this is not portable from gcc-2.95 to gcc-3.3 +#define PRINTK(level,fmt,args...) \ + printk(level DEVICE_NAME "%d: " fmt, \ + (int)(mdev-drbd_conf) , ##args) + +#define ALERT(fmt,args...) PRINTK(KERN_ALERT, fmt , ##args) +#define ERR(fmt,args...) PRINTK(KERN_ERR, fmt , ##args) +#define WARN(fmt,args...) PRINTK(KERN_WARNING, fmt , ##args) +#define INFO(fmt,args...) PRINTK(KERN_INFO, fmt , ##args) +#define DBG(fmt,args...) PRINTK(KERN_DEBUG, fmt , ##args) + +/* see kernel/printk.c:printk_ratelimit + * macro, so it is easy do have independend rate limits at different locations + * "initializer element not constant ..." with kernel 2.4 :( + * so I initialize toks to something large + */ +#define DRBD_ratelimit(ratelimit_jiffies,ratelimit_burst) \ +({ \ + int __ret; \ + static unsigned long toks = 0x80000000UL; \ + static unsigned long last_msg; \ + static int missed; \ + unsigned long now = jiffies; \ + toks += now - last_msg; \ + last_msg = now; \ + if (toks > (ratelimit_burst * ratelimit_jiffies)) \ + toks = ratelimit_burst * ratelimit_jiffies; \ + if (toks >= ratelimit_jiffies) { \ + int lost = missed; \ + missed = 0; \ + toks -= ratelimit_jiffies; \ + if (lost) \ + WARN("%d messages suppressed in %s:%d.\n",\ + lost , __FILE__ , __LINE__ ); \ + __ret=1; \ + } else { \ + missed++; \ + __ret=0; \ + } \ + __ret; \ +}) + + +#ifdef DBG_ASSERTS +extern void drbd_assert_breakpoint(drbd_dev*, char *, char *, int ); +# define D_ASSERT(exp) if (!(exp)) \ + drbd_assert_breakpoint(mdev,#exp,__FILE__,__LINE__) +#else +# define D_ASSERT(exp) if (!(exp)) \ + ERR("ASSERT( " #exp " ) in %s:%d\n", __FILE__,__LINE__) +#endif +#define ERR_IF(exp) if (({ \ + int _b = (exp)!=0; \ + if (_b) ERR("%s: (" #exp ") in %s:%d\n", __func__, __FILE__,__LINE__); \ + _b; \ + })) + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,9) +#include +#else +// RH 2.4.9 does not have linux/stringify.h +#define __stringify_1(x) #x +#define __stringify(x) __stringify_1(x) +#endif + +// integer division, round _UP_ to the next integer +#define div_ceil(A,B) ( (A)/(B) + ((A)%(B) ? 
1 : 0) ) +// usual integer division +#define div_floor(A,B) ( (A)/(B) ) + +/* + * Compatibility Section + *************************/ + +#include "drbd_compat_types.h" + +#ifdef SIGHAND_HACK +# define LOCK_SIGMASK(task,flags) spin_lock_irqsave(&task->sighand->siglock, flags) +# define UNLOCK_SIGMASK(task,flags) spin_unlock_irqrestore(&task->sighand->siglock, flags) +# define RECALC_SIGPENDING() recalc_sigpending(); +#else +# define LOCK_SIGMASK(task,flags) spin_lock_irqsave(&task->sigmask_lock, flags) +# define UNLOCK_SIGMASK(task,flags) spin_unlock_irqrestore(&task->sigmask_lock, flags) +# define RECALC_SIGPENDING() recalc_sigpending(current); +#endif + +#if defined(DBG_SPINLOCKS) && defined(__SMP__) +# define MUST_HOLD(lock) if(!spin_is_locked(lock)) { ERR("Not holding lock! in %s\n", __FUNCTION__ ); } +#else +# define MUST_HOLD(lock) +#endif + +/* + * our structs + *************************/ + +#ifndef typecheck +/* + * Check at compile time that something is of a particular type. + * Always evaluates to 1 so you may use it easily in comparisons. + */ +#define typecheck(type,x) \ +({ type __dummy; \ + typeof(x) __dummy2; \ + (void)(&__dummy == &__dummy2); \ + 1; \ +}) +#endif + +#define SET_MAGIC(x) ((x)->magic = (long)(x) ^ DRBD_MAGIC) +#define VALID_POINTER(x) ((x) ? (((x)->magic ^ DRBD_MAGIC) == (long)(x)):0) +#define INVALIDATE_MAGIC(x) (x->magic--) + +#define SET_MDEV_MAGIC(x) \ + ({ typecheck(struct Drbd_Conf*,x); \ + (x)->magic = (long)(x) ^ DRBD_MAGIC; }) +#define IS_VALID_MDEV(x) \ + ( typecheck(struct Drbd_Conf*,x) && \ + ((x) ? (((x)->magic ^ DRBD_MAGIC) == (long)(x)):0)) + + +/* + * GFP_DRBD is used for allocations inside drbd_make_request, + * and for the sk->allocation scheme. + * + * Try to get away with GFP_NOIO, which is + * in 2.4.x: (__GFP_HIGH | __GFP_WAIT) // HIGH == EMERGENCY, not HIGHMEM! + * in 2.6.x: (__GFP_WAIT) + * + * As far as i can see we do not allocate from interrupt context... + * if we do, we certainly should fix that. + * - lge + */ +#define GFP_DRBD GFP_NOIO + +/* these defines should go into blkdev.h + (if it will be ever includet into linus' linux) */ +#define RQ_DRBD_NOTHING 0x0001 +#define RQ_DRBD_SENT 0x0010 +#define RQ_DRBD_LOCAL 0x0020 +#define RQ_DRBD_DONE 0x0030 +#define RQ_DRBD_IN_TL 0x0040 + +enum MetaDataFlags { + __MDF_Consistent, + __MDF_PrimaryInd, + __MDF_ConnectedInd, + __MDF_FullSync, +}; +#define MDF_Consistent (1<<__MDF_Consistent) +#define MDF_PrimaryInd (1<<__MDF_PrimaryInd) +#define MDF_ConnectedInd (1<<__MDF_ConnectedInd) +#define MDF_FullSync (1<<__MDF_FullSync) + +/* drbd_meta-data.c (still in drbd_main.c) */ +enum MetaDataIndex { + Flags, /* Consistency flag,connected-ind,primary-ind */ + HumanCnt, /* human-intervention-count */ + TimeoutCnt, /* timout-count */ + ConnectedCnt, /* connected-count */ + ArbitraryCnt, /* arbitrary-count */ + GEN_CNT_SIZE // MUST BE LAST! (and Flags must stay first...) +}; + +#define DRBD_MD_MAGIC (DRBD_MAGIC+3) // 3nd incarnation of the file format. + +#define DRBD_PANIC 2 +/* do_panic alternatives: + * 0: panic(); + * 1: machine_halt; SORRY, this DOES NOT WORK + * 2: prink(EMERG ), plus flag to fail all eventual drbd IO, plus panic() + */ + +extern volatile int drbd_did_panic; + +#if DRBD_PANIC == 0 +#define drbd_panic(fmt, args...) \ + panic(DEVICE_NAME "%d: " fmt, (int)(mdev-drbd_conf) , ##args) +#elif DRBD_PANIC == 1 +#error "sorry , this does not work, please contribute" +#else +#define drbd_panic(fmt, args...) 
do { \ + printk(KERN_EMERG DEVICE_NAME "%d: " fmt, \ + (int)(mdev-drbd_conf) , ##args); \ + drbd_did_panic = DRBD_MAGIC; \ + smp_mb(); \ + panic(DEVICE_NAME "%d: " fmt, (int)(mdev-drbd_conf) , ##args); \ +} while (0) +#endif +#undef DRBD_PANIC + +/*** + * on the wire + *********************************************************************/ + +typedef enum { + Data, + DataReply, // Response to DataRequest + RSDataReply, // Response to RSDataRequest + Barrier, + ReportParams, + ReportBitMap, + BecomeSyncTarget, + BecomeSyncSource, + UnplugRemote, // Used at various times to hint the peer to hurry up + DataRequest, // Used to ask for a data block + RSDataRequest, // Used to ask for a data block + SyncParam, + + Ping, // These are sent on the meta socket... + PingAck, + RecvAck, // Used in protocol B + WriteAck, // Used in protocol C + NegAck, // Sent if local disk is unusable + NegDReply, // Local disk is broken... + NegRSDReply, // Local disk is broken... + BarrierAck, + + MAX_CMD, + MayIgnore = 0x100, // Flag only to test if (cmd > MayIgnore) ... + MAX_OPT_CMD, + + HandShake = 0xfffe // FIXED for the next century! +} Drbd_Packet_Cmd; + +static inline const char* cmdname(Drbd_Packet_Cmd cmd) +{ + /* THINK may need to become several global tables + * when we want to support more than + * one PRO_VERSION */ + static const char *cmdnames[] = { + [Data] = "Data", + [DataReply] = "DataReply", + [RSDataReply] = "RSDataReply", + [Barrier] = "Barrier", + [ReportParams] = "ReportParams", + [ReportBitMap] = "ReportBitMap", + [BecomeSyncTarget] = "BecomeSyncTarget", + [BecomeSyncSource] = "BecomeSyncSource", + [UnplugRemote] = "UnplugRemote", + [DataRequest] = "DataRequest", + [RSDataRequest] = "RSDataRequest", + [SyncParam] = "SyncParam", + [Ping] = "Ping", + [PingAck] = "PingAck", + [RecvAck] = "RecvAck", + [WriteAck] = "WriteAck", + [NegAck] = "NegAck", + [NegDReply] = "NegDReply", + [NegRSDReply] = "NegRSDReply", + [BarrierAck] = "BarrierAck" + }; + + if (cmd == HandShake) return "HandShake"; + if (Data > cmd || cmd >= MAX_CMD) return "Unknown"; + return cmdnames[cmd]; +} + + +/* This is the layout for a packet on the wire. + * The byteorder is the network byte order. + * (except block_id and barrier fields. + * these are pointers to local structs + * and have no relevance for the partner, + * which just echoes them as received.) + * + * NOTE that the payload starts at a long aligned offset, + * regardless of 32 or 64 bit arch! + */ +typedef struct { + u32 magic; + u16 command; + u16 length; // bytes of data after this header + char payload[0]; +} __attribute((packed)) Drbd_Header; +// 8 bytes. packet FIXED for the next century! + +/* + * short commands, packets without payload, plain Drbd_Header: + * Ping + * PingAck + * BecomeSyncTarget + * BecomeSyncSource + * UnplugRemote + */ + +/* + * commands with out-of-struct payload: + * ReportBitMap (no additional fields) + * Data, DataReply (see Drbd_Data_Packet) + */ +typedef struct { + Drbd_Header head; + u64 sector; // 64 bits sector number + u64 block_id; // Used in protocol B&C for the address of the req. 
+} __attribute((packed)) Drbd_Data_Packet; + +/* + * commands which share a struct: + * RecvAck (proto B), WriteAck (proto C) (see Drbd_BlockAck_Packet) + * DataRequest, RSDataRequest (see Drbd_BlockRequest_Packet) + */ +typedef struct { + Drbd_Header head; + u64 sector; + u64 block_id; + u32 blksize; + u32 pad; //make sure packet is a multiple of 8 Byte +} __attribute((packed)) Drbd_BlockAck_Packet; + +typedef struct { + Drbd_Header head; + u64 sector; + u64 block_id; + u32 blksize; + u32 pad; //make sure packet is a multiple of 8 Byte +} __attribute((packed)) Drbd_BlockRequest_Packet; + +/* + * commands with their own struct for additional fields: + * HandShake + * Barrier + * BarrierAck + * SyncParam + * ReportParams + */ + +typedef struct { + Drbd_Header head; // 8 bytes + u32 protocol_version; + u32 feature_flags; + + /* should be more than enough for future enhancements + * for now, feature_flags and the reserverd array shall be zero. + */ + + u64 reserverd[8]; +} __attribute((packed)) Drbd_HandShake_Packet; +// 80 bytes, FIXED for the next century + +typedef struct { + Drbd_Header head; + u32 barrier; // may be 0 or a barrier number + u32 pad; //make sure packet is a multiple of 8 Byte +} __attribute((packed)) Drbd_Barrier_Packet; + +typedef struct { + Drbd_Header head; + u32 barrier; + u32 set_size; +} __attribute((packed)) Drbd_BarrierAck_Packet; + +typedef struct { + Drbd_Header head; + u32 rate; + u32 use_csums; + u32 skip; + u32 group; +} __attribute((packed)) Drbd_SyncParam_Packet; + +/* FIXME add more members here, until we introduce a new fixed size + * protocol version handshake packet! */ +typedef struct { + Drbd_Header head; + u64 p_size; // size of disk + u64 u_size; // user requested size + u32 state; + u32 protocol; + u32 version; + u32 gen_cnt[GEN_CNT_SIZE]; + u32 sync_rate; + u32 sync_use_csums; + u32 skip_sync; + u32 sync_group; + u32 flags; // flags & 1 -> reply call drbd_send_param(mdev); + u32 magic; //make sure packet is a multiple of 8 Byte +} __attribute((packed)) Drbd_Parameter_Packet; + +typedef struct { + u64 size; + u32 state; + u32 blksize; + u32 protocol; + u32 version; + u32 gen_cnt[5]; + u32 bit_map_gen[5]; +} __attribute((packed)) Drbd06_Parameter_P; + +typedef union { + Drbd_Header head; + Drbd_HandShake_Packet HandShake; + Drbd_Data_Packet Data; + Drbd_BlockAck_Packet BlockAck; + Drbd_Barrier_Packet Barrier; + Drbd_BarrierAck_Packet BarrierAck; + Drbd_SyncParam_Packet SyncParam; + Drbd_Parameter_Packet Parameter; + Drbd_BlockRequest_Packet BlockRequest; +} __attribute((packed)) Drbd_Polymorph_Packet; + +/**********************************************************************/ + +typedef enum { + None, + Running, + Exiting, + Restarting +} Drbd_thread_state; + +struct Drbd_thread { + spinlock_t t_lock; + struct task_struct *task; + struct completion startstop; + Drbd_thread_state t_state; + int (*function) (struct Drbd_thread *); + drbd_dev *mdev; +}; + +static inline Drbd_thread_state get_t_state(struct Drbd_thread *thi) +{ + /* THINK testing the t_state seems to be uncritical in all cases + * (but thread_{start,stop}), so we can read it *without* the lock. + * --lge */ + + smp_rmb(); + return (volatile int)thi->t_state; +} + + +/* + * Having this as the first member of a struct provides sort of "inheritance". + * "derived" structs can be "drbd_queue_work()"ed. + * The callback should know and cast back to the descendant struct. + * drbd_request and Tl_epoch_entry are descendants of drbd_work. 
+ */ +struct drbd_work; +typedef int (*drbd_work_cb)(drbd_dev*, struct drbd_work*, int cancel); +struct drbd_work { + struct list_head list; + drbd_work_cb cb; +}; + +/* + * since we eventually don't want to "remap" any bhs, but allways need a + * private bh, it may as well be part of the struct so we do not need to + * allocate it separately. it is only used as a clone, and since we own it, we + * can abuse certain fields of if for our own needs. and, since it is part of + * the struct, we can use b_private for other things than the req, e.g. mdev, + * since we get the request struct by means of the "container_of()" macro. + * -lge + */ + +struct drbd_barrier; +struct drbd_request { + struct drbd_work w; + long magic; + int rq_status; + struct drbd_barrier *barrier; // The next barrier. + drbd_bio_t *master_bio; // master bio pointer +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + drbd_bio_t private_bio; // private bio struct +#else + struct bio *private_bio; + drbd_dev *mdev; +#endif +}; + +struct drbd_barrier { + struct list_head requests; // requests before + struct drbd_barrier *next; // pointer to the next barrier + int br_number; // the barriers identifier. + int n_req; // number of requests attached before this barrier +}; + +typedef struct drbd_request drbd_request_t; + +/* These Tl_epoch_entries may be in one of 6 lists: + free_ee .. free entries + active_ee .. data packet being written + sync_ee .. syncer block being written + done_ee .. block written, need to send WriteAck + read_ee .. [RS]DataRequest being read +*/ + +/* Since whenever we allocate a Tl_epoch_entry, we allocated a buffer_head, + * at the same time, we might as well put it as member into the struct. + * Yes, we may "waste" a little memory since the unused EEs on the free_ee list + * are somewhat larger. For 2.6, this will be a struct_bio, which is fairly + * small, and since we adopt the amount dynamically anyways, this is not an + * issue. + * + * TODO + * I'd like to "drop" the free list altogether, since we use mempools, which + * are designed for this. We probably would still need a private "page pool" + * to do the "bio_add_page" from. + * -lge + */ +struct Tl_epoch_entry { + struct drbd_work w; + drbd_bio_t private_bio; // private bio struct, NOT a pointer + u64 block_id; + long magic; + ONLY_IN_26(unsigned int ee_size;) + ONLY_IN_26(sector_t ee_sector;) + // THINK: maybe we rather want bio_alloc(GFP_*,1) + ONLY_IN_26(struct bio_vec ee_bvec;) +}; + +/* flag bits */ +enum { + ISSUE_BARRIER, // next Data is preceeded by a Barrier + SIGNAL_ASENDER, // whether asender wants to be interrupted + SEND_PING, // whether asender should send a ping asap + WRITER_PRESENT, // somebody opened us with write intent + STOP_SYNC_TIMER, // tell timer to cancel itself + DO_NOT_INC_CONCNT, // well, don't ... + ON_PRI_INC_HUMAN, // When we become primary increase human-count + ON_PRI_INC_TIMEOUTEX, // When " - " increase timeout-count + UNPLUG_QUEUED, // only relevant with kernel 2.4 + UNPLUG_REMOTE, // whether sending a "UnplugRemote" makes sense + DISKLESS, // no local disk + PARTNER_DISKLESS, // partner has no storage + PARTNER_CONSISTENT, // partner has consistent data + PROCESS_EE_RUNNING, // eek! + MD_IO_ALLOWED, // EXPLAIN + SENT_DISK_FAILURE, // sending it once is enough + MD_DIRTY, // current gen counts and flags not yet on disk + SYNC_STARTED, // Needed to agree on the exact point in time.. + USE_DEGR_WFC_T, // Use degr-wfc-timeout instad of wfc-timeout. 
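The "cast back to the descendant struct" mentioned in the drbd_work comment above works because the drbd_work is always the first member of its users; container_of() (available on 2.6, open-coded list_entry()-style arithmetic on 2.4) does the same thing without relying on the member's position. A sketch of a callback using the types just defined, not one of the real w_* callbacks declared further down:

STATIC int w_example_cb(drbd_dev *mdev, struct drbd_work *w, int cancel)
{
    /* valid only while w stays the first member of struct drbd_request */
    drbd_request_t *req = (drbd_request_t *) w;
    /* equivalent, but independent of where the member sits in the struct */
    drbd_request_t *same = container_of(w, struct drbd_request, w);

    D_ASSERT(req == same);
    if (cancel)
        return 1;    /* nothing was sent, nothing to undo */
    /* ... do the actual work on req here ... */
    return 1;
}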
+}; + +struct drbd_bitmap; // opaque for Drbd_Conf + +// TODO sort members for performance +// MAYBE group them further + +/* THINK maybe we actually want to use the default "event/%s" worker threads + * or similar in linux 2.6, which uses per cpu data and threads. + * + * To be general, this might need a spin_lock member. + * For now, please use the mdev->req_lock to protect list_head, + * see drbd_queue_work below. + */ +struct drbd_work_queue { + struct list_head q; + struct semaphore s; // producers up it, worker down()s it +}; + +/* If Philipp agrees, we remove the "mutex", and make_request will only + * (throttle on "queue full" condition and) queue it to the worker thread... + * which then is free to do whatever is needed, and has exclusive send access + * to the data socket ... + */ +struct drbd_socket { + struct drbd_work_queue work; + struct semaphore mutex; + struct socket *socket; + Drbd_Polymorph_Packet sbuf; // this way we get our + Drbd_Polymorph_Packet rbuf; // send/receive buffers off the stack +}; + +struct Drbd_Conf { +#ifdef PARANOIA + long magic; +#endif + struct net_config conf; + struct syncer_config sync_conf; + enum io_error_handler on_io_error; + struct semaphore device_mutex; + struct drbd_socket data; // for data/barrier/cstate/parameter packets + struct drbd_socket meta; // for ping/ack (metadata) packets + volatile unsigned long last_received; // in jiffies, either socket + volatile unsigned int ko_count; + struct drbd_work resync_work, + barrier_work, + unplug_work; + struct timer_list resync_timer; +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + kdev_t backing_bdev; // backing device + kdev_t this_bdev; + kdev_t md_bdev; // device for meta-data. +#else + struct block_device *backing_bdev; + struct block_device *this_bdev; + struct block_device *md_bdev; + struct gendisk *vdisk; + request_queue_t *rq_queue; +#endif + // THINK is this the same in 2.6.x ?? + struct file *lo_file; + struct file *md_file; + int md_index; + sector_t lo_usize; /* user provided size */ + sector_t p_size; /* partner's disk size */ + Drbd_State state; + volatile Drbd_CState cstate; + wait_queue_head_t cstate_wait; // TODO Rename into "misc_wait". 
+ Drbd_State o_state; + sector_t la_size; // last agreed disk size + unsigned int send_cnt; + unsigned int recv_cnt; + unsigned int read_cnt; + unsigned int writ_cnt; + unsigned int al_writ_cnt; + unsigned int bm_writ_cnt; + atomic_t ap_bio_cnt; // Requests we need to complete + atomic_t ap_pending_cnt; // AP data packets on the wire, ack expected + atomic_t rs_pending_cnt; // RS request/data packets on the wire + atomic_t unacked_cnt; // Need to send replys for + atomic_t local_cnt; // Waiting for local disk to signal completion + spinlock_t req_lock; + spinlock_t tl_lock; + struct drbd_barrier* newest_barrier; + struct drbd_barrier* oldest_barrier; + unsigned long flags; + struct task_struct *send_task; /* about pid calling drbd_send */ + spinlock_t send_task_lock; + // sector_t rs_left; // blocks not up-to-date [unit BM_BLOCK_SIZE] + // moved into bitmap->bm_set + unsigned long rs_total; // blocks to sync in this run [unit BM_BLOCK_SIZE] + unsigned long rs_start; // Syncer's start time [unit jiffies] + unsigned long rs_paused; // cumulated time in PausedSyncX state [unit jiffies] + unsigned long rs_mark_left;// block not up-to-date at mark [unit BM_BLOCK_SIZE] + unsigned long rs_mark_time;// marks's time [unit jiffies] + struct Drbd_thread receiver; + struct Drbd_thread worker; + struct Drbd_thread asender; + struct drbd_bitmap* bitmap; + struct lru_cache* resync; // Used to track operations of resync... + atomic_t resync_locked; // Number of locked elements in resync LRU + int open_cnt; + u32 gen_cnt[GEN_CNT_SIZE]; + atomic_t epoch_size; + spinlock_t ee_lock; + struct list_head free_ee; // available + struct list_head active_ee; // IO in progress + struct list_head sync_ee; // IO in progress + struct list_head done_ee; // send ack + struct list_head read_ee; // IO in progress + struct list_head net_ee; // zero-copy network send in progress + spinlock_t pr_lock; + struct list_head app_reads; + struct list_head resync_reads; + int ee_vacant; + int ee_in_use; + wait_queue_head_t ee_wait; + NOT_IN_26(struct tq_struct write_hint_tq;) + struct page *md_io_page; // one page buffer for md_io + struct page *md_io_tmpp; // in case hardsect != 512 [ s390 only? 
] + struct semaphore md_io_mutex; // protects the md_io_buffer + spinlock_t al_lock; + wait_queue_head_t al_wait; + struct lru_cache* act_log; // activity log + unsigned int al_tr_number; + int al_tr_cycle; + int al_tr_pos; // position of the next transaction in the journal +}; + + +/* + * function declarations + *************************/ + +// drbd_main.c +extern void _set_cstate(drbd_dev* mdev,Drbd_CState cs); +extern void drbd_thread_start(struct Drbd_thread *thi); +extern void _drbd_thread_stop(struct Drbd_thread *thi, int restart, int wait); +extern void drbd_free_resources(drbd_dev *mdev); +extern void tl_release(drbd_dev *mdev,unsigned int barrier_nr, + unsigned int set_size); +extern void tl_clear(drbd_dev *mdev); +extern int tl_dependence(drbd_dev *mdev, drbd_request_t * item); +extern void drbd_free_sock(drbd_dev *mdev); +extern int drbd_send(drbd_dev *mdev, struct socket *sock, + void* buf, size_t size, unsigned msg_flags); +extern int drbd_send_param(drbd_dev *mdev, int flags); +extern int _drbd_send_cmd(drbd_dev *mdev, struct socket *sock, + Drbd_Packet_Cmd cmd, Drbd_Header *h, + size_t size, unsigned msg_flags); +extern int drbd_send_cmd(drbd_dev *mdev, struct socket *sock, + Drbd_Packet_Cmd cmd, Drbd_Header *h, size_t size); +extern int drbd_send_sync_param(drbd_dev *mdev, struct syncer_config *sc); +extern int drbd_send_b_ack(drbd_dev *mdev, u32 barrier_nr, + u32 set_size); +extern int drbd_send_ack(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + struct Tl_epoch_entry *e); +extern int _drbd_send_page(drbd_dev *mdev, struct page *page, + int offset, size_t size); +extern int drbd_send_block(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + struct Tl_epoch_entry *e); +extern int drbd_send_dblock(drbd_dev *mdev, drbd_request_t *req); +extern int _drbd_send_barrier(drbd_dev *mdev); +extern int drbd_send_drequest(drbd_dev *mdev, int cmd, + sector_t sector,int size, u64 block_id); +extern int drbd_send_bitmap(drbd_dev *mdev); +extern int _drbd_send_bitmap(drbd_dev *mdev); +extern void drbd_free_ll_dev(drbd_dev *mdev); +extern int drbd_io_error(drbd_dev* mdev); +extern void drbd_mdev_cleanup(drbd_dev *mdev); + +// drbd_meta-data.c (still in drbd_main.c) +extern void drbd_md_write(drbd_dev *mdev); +extern int drbd_md_read(drbd_dev *mdev); +extern int drbd_md_compare(drbd_dev *mdev,Drbd_Parameter_Packet *partner); +extern void drbd_dump_md(drbd_dev *, Drbd_Parameter_Packet *, int ); +// maybe define them below as inline? +extern void drbd_md_inc(drbd_dev *mdev, enum MetaDataIndex order); +extern void drbd_md_set_flag(drbd_dev *mdev, int flags); +extern void drbd_md_clear_flag(drbd_dev *mdev, int flags); +extern int drbd_md_test_flag(drbd_dev *mdev, int flag); + +/* Meta data layout + We reserve a 128MB Block (4k aligned) + * either at the end of the backing device + * or on a seperate meta data device. */ + +#define MD_RESERVED_SIZE ( 128LU * (1<<10) ) // 128 MB ( in units of kb ) +// The following numbers are sectors +#define MD_GC_OFFSET 0 +#define MD_AL_OFFSET 8 // 8 Sectors after start of meta area +#define MD_AL_MAX_SIZE 64 // = 32 kb LOG ~ 3776 extents ~ 14 GB Storage +#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE) //Allows up to about 3.8TB + +#define MD_HARDSECT_B 9 // Since the smalles IO unit is usually 512 byte +#define MD_HARDSECT (1< we need 32 KB bitmap. + * Bit 0 ==> local node thinks this block is binary identical on both nodes + * Bit 1 ==> local node thinks this block needs to be synced. 
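To put numbers on the reserved area described above: a sketch in plain user-space C, restating the MD_* defines as assumptions (one sector = 512 byte) and using the same end-of-device placement that drbd_md_ss() applies further down for internal meta data (md_index == -1):

#include <stdio.h>

#define MD_RESERVED_SIZE (128UL * (1 << 10))    /* 128 MB, in KB */
#define MD_AL_OFFSET     8                      /* activity log after the GC words */
#define MD_AL_MAX_SIZE   64                     /* 64 sectors = 32 KB of AL */
#define MD_BM_OFFSET     (MD_AL_OFFSET + MD_AL_MAX_SIZE)

int main(void)
{
    unsigned long long dev_sectors = 200ULL << 21;   /* a 200 GiB lower device */
    /* last 128 MB of the device, capacity rounded down to a 4K boundary */
    unsigned long long md_start = (dev_sectors & ~7ULL) - (MD_RESERVED_SIZE << 1);

    printf("internal meta data starts at sector %llu\n", md_start);
    printf("gc words at +0, AL at +%d..+%d, bitmap from +%d\n",
           MD_AL_OFFSET, MD_AL_OFFSET + MD_AL_MAX_SIZE - 1, MD_BM_OFFSET);
    return 0;
}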
+ */ + +#define BM_BLOCK_SIZE_B 12 // 4k per bit +#define BM_BLOCK_SIZE (1<>(BM_BLOCK_SIZE_B-9)) +#define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SIZE_B-9)) +#define BM_SECT_PER_BIT BM_BIT_TO_SECT(1) + +/* bit to represented kilo byte conversion */ +#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SIZE_B-10)) + +/* in which _bitmap_ extent (resp. sector) the bit for a certain + * _storage_ sector is located in */ +#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SIZE_B-9)) + +/* in one sector of the bitmap, we have this many activity_log extents. */ +#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SIZE_B - AL_EXTENT_SIZE_B) ) +#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SIZE_B-BM_BLOCK_SIZE_B-LN2_BPL)) + + +/* I want the packet to fit within one page + * THINK maybe use a special bitmap header, + * including offset and compression scheme and whatnot + * Do not use PAGE_SIZE here! Use a architecture agnostic constant! + */ +#define BM_PACKET_WORDS ((4096-sizeof(Drbd_Header))/sizeof(long)) + +/* the extent in "PER_EXTENT" below is an activity log extent + * we need that many (long words/bytes) to store the bitmap + * of one AL_EXTENT_SIZE chunk of storage. + * we can store the bitmap for that many AL_EXTENTS within + * one sector of the _on_disk_ bitmap: + * bit 0 bit 37 bit 38 bit (512*8)-1 + * ...|........|........|.. // ..|........| + * sect. 0 `296 `304 ^(512*8*8)-1 + * +#define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG ) +#define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128 +#define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4 + */ + +#define DRBD_MAX_SECTORS_32 (0xffffffffLU) +#define DRBD_MAX_SECTORS_BM \ + ( (MD_RESERVED_SIZE*2LL - MD_BM_OFFSET) * (1LL<<(BM_EXT_SIZE_B-9)) ) +#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32 +#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM +#elif ( !defined(CONFIG_LBD) ) && ( BITS_PER_LONG == 32 ) +#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 +#else +#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM +#endif + +extern int drbd_bm_init (drbd_dev *mdev); +extern int drbd_bm_resize (drbd_dev *mdev, sector_t sectors); +extern void drbd_bm_cleanup (drbd_dev *mdev); +extern void drbd_bm_set_all (drbd_dev *mdev); +extern void drbd_bm_clear_all (drbd_dev *mdev); +extern void drbd_bm_reset_find(drbd_dev *mdev); +extern int drbd_bm_set_bit (drbd_dev *mdev, unsigned long bitnr); +extern int drbd_bm_test_bit (drbd_dev *mdev, unsigned long bitnr); +extern int drbd_bm_clear_bit (drbd_dev *mdev, unsigned long bitnr); +extern int drbd_bm_e_weight (drbd_dev *mdev, unsigned long enr); +extern int drbd_bm_read_sect (drbd_dev *mdev, unsigned long enr); +extern int drbd_bm_write_sect(drbd_dev *mdev, unsigned long enr); +extern void drbd_bm_read (drbd_dev *mdev); +extern void drbd_bm_write (drbd_dev *mdev); +extern unsigned long drbd_bm_ALe_set_all (drbd_dev *mdev, unsigned long al_enr); +extern size_t drbd_bm_words (drbd_dev *mdev); +extern sector_t drbd_bm_capacity (drbd_dev *mdev); +extern unsigned long drbd_bm_find_next (drbd_dev *mdev); +extern unsigned long drbd_bm_total_weight(drbd_dev *mdev); +extern int drbd_bm_rs_done(drbd_dev *mdev); +// for receive_bitmap +extern void drbd_bm_merge_lel (drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer); +// for _drbd_send_bitmap and drbd_bm_write_sect +extern void drbd_bm_get_lel (drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer); +/* + * only used by drbd_bm_read_sect +extern void drbd_bm_set_lel (drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer); +*/ + 
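The shifts above boil down to one bitmap bit per 4 KiB of storage, so 512-byte sectors and bit numbers convert with a shift by 3 (BM_BLOCK_SIZE_B - 9). A user-space self-check of that arithmetic, restating the constants as assumptions:

#include <stdio.h>

#define BM_BLOCK_SIZE_B 12                                /* 4 KiB per bit */
#define BM_SECT_TO_BIT(x)  ((x) >> (BM_BLOCK_SIZE_B - 9)) /* 512 B sectors */
#define Bit2KB(bits)       ((bits) << (BM_BLOCK_SIZE_B - 10))

int main(void)
{
    unsigned long long sectors = 1ULL << 21;   /* 1 GiB worth of 512 B sectors */
    unsigned long long bits    = BM_SECT_TO_BIT(sectors);

    printf("%llu sectors -> %llu bits -> %llu bytes of in-core bitmap\n",
           sectors, bits, bits / 8);
    printf("those bits represent %llu KB of storage\n",
           (unsigned long long) Bit2KB(bits));
    return 0;
}
/* prints: 2097152 sectors -> 262144 bits -> 32768 bytes of in-core bitmap,
 * i.e. 32 KB of bitmap per GiB of storage, and 1048576 KB represented. */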
+extern void __drbd_bm_lock (drbd_dev *mdev, char* file, int line); +extern void drbd_bm_unlock (drbd_dev *mdev); +#define drbd_bm_lock(mdev) __drbd_bm_lock(mdev, __FILE__, __LINE__ ) + + +// drbd_main.c +extern drbd_dev *drbd_conf; +extern int minor_count; +extern kmem_cache_t *drbd_request_cache; +extern kmem_cache_t *drbd_ee_cache; +extern mempool_t *drbd_request_mempool; + +// drbd_req +#define ERF_NOTLD 2 /* do not call tl_dependence */ +extern void drbd_end_req(drbd_request_t *, int, int, sector_t); +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +extern int drbd_make_request_24(request_queue_t *q, int rw, struct buffer_head *bio); +#else +extern int drbd_make_request_26(request_queue_t *q, struct bio *bio); +#endif +extern int drbd_read_remote(drbd_dev *mdev, drbd_request_t *req); + +// drbd_fs.c +extern char* ppsize(char* buf, size_t size); +extern int drbd_determin_dev_size(drbd_dev*); +extern sector_t drbd_new_dev_size(struct Drbd_Conf*); +extern int drbd_set_state(drbd_dev *mdev,Drbd_State newstate); +extern int drbd_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); +extern long drbd_compat_ioctl(struct file *f, unsigned cmd, unsigned long arg); + +// drbd_worker.c +extern int drbd_worker(struct Drbd_thread *thi); +extern void drbd_alter_sg(drbd_dev *mdev, int ng); +extern void drbd_start_resync(drbd_dev *mdev, Drbd_CState side); +extern int drbd_resync_finished(drbd_dev *mdev); +// maybe rather drbd_main.c ? +extern int drbd_md_sync_page_io(drbd_dev *mdev, sector_t sector, int rw); +// worker callbacks +extern int w_is_app_read (drbd_dev *, struct drbd_work *, int); +extern int w_is_resync_read (drbd_dev *, struct drbd_work *, int); +extern int w_read_retry_remote (drbd_dev *, struct drbd_work *, int); +extern int w_e_end_data_req (drbd_dev *, struct drbd_work *, int); +extern int w_e_end_rsdata_req (drbd_dev *, struct drbd_work *, int); +extern int w_resync_inactive (drbd_dev *, struct drbd_work *, int); +extern int w_resume_next_sg (drbd_dev *, struct drbd_work *, int); +extern int w_io_error (drbd_dev *, struct drbd_work *, int); +extern int w_try_send_barrier (drbd_dev *, struct drbd_work *, int); +extern int w_send_write_hint (drbd_dev *, struct drbd_work *, int); +extern int w_make_resync_request (drbd_dev *, struct drbd_work *, int); +extern void resync_timer_fn(unsigned long data); + +// drbd_receiver.c +extern int drbd_release_ee(drbd_dev* mdev,struct list_head* list); +extern int drbd_init_ee(drbd_dev* mdev); +extern void drbd_put_ee(drbd_dev* mdev,struct Tl_epoch_entry *e); +extern struct Tl_epoch_entry* drbd_get_ee(drbd_dev* mdev); +extern void drbd_wait_ee(drbd_dev *mdev,struct list_head *head); + +// drbd_proc.c +extern struct proc_dir_entry *drbd_proc; +extern struct file_operations drbd_proc_fops; +extern const char* cstate_to_name(Drbd_CState s); +extern const char* nodestate_to_name(Drbd_State s); + +// drbd_actlog.c +extern void drbd_al_begin_io(struct Drbd_Conf *mdev, sector_t sector); +extern void drbd_al_complete_io(struct Drbd_Conf *mdev, sector_t sector); +extern void drbd_rs_complete_io(struct Drbd_Conf *mdev, sector_t sector); +extern int drbd_rs_begin_io(struct Drbd_Conf *mdev, sector_t sector); +extern void drbd_rs_cancel_all(drbd_dev* mdev); +extern void drbd_al_read_log(struct Drbd_Conf *mdev); +extern void __drbd_set_in_sync(drbd_dev* mdev, sector_t sector, int size, const char* file, const unsigned int line); +#define drbd_set_in_sync(mdev,sector,size) \ + __drbd_set_in_sync(mdev,sector,size, __FILE__, __LINE__ 
) +extern void __drbd_set_out_of_sync(drbd_dev* mdev, sector_t sector, int size, const char* file, const unsigned int line); +#define drbd_set_out_of_sync(mdev,sector,size) \ + __drbd_set_out_of_sync(mdev,sector,size, __FILE__, __LINE__ ) +extern void drbd_al_apply_to_bm(struct Drbd_Conf *mdev); +extern void drbd_al_to_on_disk_bm(struct Drbd_Conf *mdev); +extern void drbd_al_shrink(struct Drbd_Conf *mdev); + +/* + * event macros + *************************/ + +// sched.h does not have it with timeout, so here goes: + +#ifndef wait_event_interruptible_timeout +#define __wait_event_interruptible_timeout(wq, condition, ret) \ +do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (condition) \ + break; \ + if (!signal_pending(current)) { \ + ret = schedule_timeout(ret); \ + if (!ret) \ + break; \ + continue; \ + } \ + ret = -EINTR; \ + break; \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#define wait_event_interruptible_timeout(wq, condition, timeout) \ +({ \ + long __ret = timeout; \ + if (!(condition)) \ + __wait_event_interruptible_timeout(wq, condition, __ret); \ + __ret; \ +}) +#endif + +/* + * inline helper functions + *************************/ + +#include "drbd_compat_wrappers.h" + +static inline int drbd_disk_less_node_present(struct Drbd_Conf* mdev) +{ + sector_t p_size = mdev->p_size; + sector_t m_size = drbd_get_capacity(mdev->backing_bdev); + + return ! ( p_size && m_size ) ; +} + +static inline void +drbd_flush_signals(struct task_struct *t) +{ + NOT_IN_26( + unsigned long flags; + LOCK_SIGMASK(t,flags); + ) + + flush_signals(t); + NOT_IN_26(UNLOCK_SIGMASK(t,flags)); +} + +static inline void set_cstate(drbd_dev* mdev,Drbd_CState ns) +{ + unsigned long flags; + spin_lock_irqsave(&mdev->req_lock,flags); + _set_cstate(mdev,ns); + spin_unlock_irqrestore(&mdev->req_lock,flags); +} + +/** + * drbd_chk_io_error: Handles the on_io_error setting, should be called from + * all io completion handlers. See also drbd_io_error(). + */ +static inline void drbd_chk_io_error(drbd_dev* mdev, int error) +{ + if (error) { + switch(mdev->on_io_error) { + case PassOn: + ERR("Ignoring local IO error!\n"); + break; + case Panic: + set_bit(DISKLESS,&mdev->flags); + smp_mb(); // but why is there smp_mb__after_clear_bit() ? + drbd_panic("IO error on backing device!\n"); + break; + case Detach: + /*lge: + * I still do not fully grasp when to set or clear + * this flag... but I want to be able to at least + * still _try_ and write the "I am inconsistent, and + * need full sync" information to the MD. */ + set_bit(MD_IO_ALLOWED,&mdev->flags); + drbd_md_set_flag(mdev,MDF_FullSync); + drbd_md_clear_flag(mdev,MDF_Consistent); + if (!test_and_set_bit(DISKLESS,&mdev->flags)) { + smp_mb(); // Nack is sent in w_e handlers. + ERR("Local IO failed. 
Detaching...\n"); + } + break; + } + } +} + +static inline int semaphore_is_locked(struct semaphore* s) +{ + if(!down_trylock(s)) { + up(s); + return 0; + } + return 1; +} +/* Returns the start sector for metadata, aligned to 4K + * which happens to be the capacity we announce for + * our lower level device if it includes the meta data + */ +static inline sector_t drbd_md_ss(drbd_dev *mdev) +{ + if( mdev->md_index == -1 ) { + if (!mdev->backing_bdev) { + if (DRBD_ratelimit(5*HZ,5)) { + ERR("mdev->backing_bdev==NULL\n"); + dump_stack(); + } + return 0; + } + return ( (drbd_get_capacity(mdev->backing_bdev) & ~7L) + - (MD_RESERVED_SIZE<<1) ); + } else { + return 2 * MD_RESERVED_SIZE * mdev->md_index; + } +} + +static inline void +_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) +{ + list_add_tail(&w->list,&q->q); + up(&q->s); +} + +static inline void +_drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) +{ + list_add(&w->list,&q->q); + up(&q->s); +} + +static inline void +drbd_queue_work_front(drbd_dev *mdev, struct drbd_work_queue *q, + struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&mdev->req_lock,flags); + list_add(&w->list,&q->q); + spin_unlock_irqrestore(&mdev->req_lock,flags); + up(&q->s); +} + +static inline void +drbd_queue_work(drbd_dev *mdev, struct drbd_work_queue *q, + struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&mdev->req_lock,flags); + list_add_tail(&w->list,&q->q); + spin_unlock_irqrestore(&mdev->req_lock,flags); + up(&q->s); +} + +static inline void wake_asender(drbd_dev *mdev) { + if(test_bit(SIGNAL_ASENDER, &mdev->flags)) { + force_sig(DRBD_SIG, mdev->asender.task); + } +} + +static inline void request_ping(drbd_dev *mdev) { + set_bit(SEND_PING,&mdev->flags); + wake_asender(mdev); +} + +static inline int drbd_send_short_cmd(drbd_dev *mdev, Drbd_Packet_Cmd cmd) +{ + Drbd_Header h; + return drbd_send_cmd(mdev,mdev->data.socket,cmd,&h,sizeof(h)); +} + +static inline int drbd_send_ping(drbd_dev *mdev) +{ + Drbd_Header h; + return drbd_send_cmd(mdev,mdev->meta.socket,Ping,&h,sizeof(h)); +} + +static inline int drbd_send_ping_ack(drbd_dev *mdev) +{ + Drbd_Header h; + return drbd_send_cmd(mdev,mdev->meta.socket,PingAck,&h,sizeof(h)); +} + +static inline void drbd_thread_stop(struct Drbd_thread *thi) +{ + _drbd_thread_stop(thi,FALSE,TRUE); +} + +static inline void drbd_thread_stop_nowait(struct Drbd_thread *thi) +{ + _drbd_thread_stop(thi,FALSE,FALSE); +} + +static inline void drbd_thread_restart_nowait(struct Drbd_thread *thi) +{ + _drbd_thread_stop(thi,TRUE,FALSE); +} + +static inline void inc_ap_pending(drbd_dev* mdev) +{ + atomic_inc(&mdev->ap_pending_cnt); +} + +#define ERR_IF_CNT_IS_NEGATIVE(which) \ + if(atomic_read(&mdev->which)<0) \ + ERR("in %s:%d: " #which " = %d < 0 !\n", \ + __func__ , __LINE__ , \ + atomic_read(&mdev->which)) + +#define dec_ap_pending(mdev) \ + typecheck(drbd_dev*,mdev); \ + if(atomic_dec_and_test(&mdev->ap_pending_cnt)) \ + wake_up(&mdev->cstate_wait); \ + ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt) + +static inline void inc_rs_pending(drbd_dev* mdev) +{ + atomic_inc(&mdev->rs_pending_cnt); +} + +#define dec_rs_pending(mdev) \ + typecheck(drbd_dev*,mdev); \ + atomic_dec(&mdev->rs_pending_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt) + +static inline void inc_unacked(drbd_dev* mdev) +{ + atomic_inc(&mdev->unacked_cnt); +} + +#if 0 && LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +/* + * idea was to forcefully push the tcp stack whenever the + * currently last pending packet is in 
the buffer. + * should be benchmarked on some real box to see if it has any + * effect on overall latency. + */ + +/* this only works with 2.6 kernels because of some conflicting defines + * in header files included from net.tcp.h. + */ + +#include +static inline void drbd_push_msock(drbd_dev* mdev) +{ + struct sock *sk; + struct tcp_opt *tp; + if (mdev->meta.socket == NULL) return; + sk = mdev->meta.socket->sk; + tp = tcp_sk(sk); + lock_sock(sk); + __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), TCP_NAGLE_PUSH); + release_sock(sk); +} + +#define dec_unacked(mdev) \ + might_sleep(); \ + typecheck(drbd_dev*,mdev); \ + if (atomic_dec_and_test(&mdev->unacked_cnt)) \ + drbd_push_msock(mdev); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); + +#define sub_unacked(mdev, n) \ + might_sleep(); \ + typecheck(drbd_dev*,mdev); \ + if (atomic_sub_and_test(n, &mdev->unacked_cnt)) \ + drbd_push_msock(mdev); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); +#else +#define dec_unacked(mdev) \ + typecheck(drbd_dev*,mdev); \ + atomic_dec(&mdev->unacked_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt) + +#define sub_unacked(mdev, n) \ + typecheck(drbd_dev*,mdev); \ + atomic_sub(n, &mdev->unacked_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt) +#endif + + +/** + * inc_local: Returns TRUE when local IO is possible. If it returns + * TRUE you should call dec_local() after IO is completed. + */ +static inline int inc_local(drbd_dev* mdev) +{ + int io_allowed; + + atomic_inc(&mdev->local_cnt); + io_allowed = !test_bit(DISKLESS,&mdev->flags); + if( !io_allowed ) { + atomic_dec(&mdev->local_cnt); + } + return io_allowed; +} + +static inline int inc_local_md_only(drbd_dev* mdev) +{ + int io_allowed; + + atomic_inc(&mdev->local_cnt); + io_allowed = !test_bit(DISKLESS,&mdev->flags) || + test_bit(MD_IO_ALLOWED,&mdev->flags); + if( !io_allowed ) { + atomic_dec(&mdev->local_cnt); + } + return io_allowed; +} + +static inline void dec_local(drbd_dev* mdev) +{ + if(atomic_dec_and_test(&mdev->local_cnt) && + test_bit(DISKLESS,&mdev->flags) && + mdev->lo_file) { + wake_up(&mdev->cstate_wait); + } + + D_ASSERT(atomic_read(&mdev->local_cnt)>=0); +} + +static inline void inc_ap_bio(drbd_dev* mdev) +{ + atomic_inc(&mdev->ap_bio_cnt); +} + +static inline void dec_ap_bio(drbd_dev* mdev) +{ + if(atomic_dec_and_test(&mdev->ap_bio_cnt)) + wake_up(&mdev->cstate_wait); + + D_ASSERT(atomic_read(&mdev->ap_bio_cnt)>=0); +} + +#ifdef DUMP_EACH_PACKET +/* + * enable to dump information about every packet exchange. + */ +#define INFOP(fmt, args...) \ + INFO("%s:%d: %s [%d] %s %s " fmt , \ + file, line, current->comm, current->pid, \ + sockname, recv?"<<<":">>>" \ + , ## args ) +static inline void +dump_packet(drbd_dev *mdev, struct socket *sock, + int recv, Drbd_Polymorph_Packet *p, char* file, int line) +{ + char *sockname = sock == mdev->meta.socket ? "meta" : "data"; + int cmd = (recv == 2) ? p->head.command : be16_to_cpu(p->head.command); + switch (cmd) { + case HandShake: + INFOP("%s (%u)\n", be32_to_cpu(p->HandShake.protocol_version)); + break; + + case Ping: + case PingAck: + case BecomeSyncTarget: + case BecomeSyncSource: + case UnplugRemote: + + case SyncParam: + case ReportParams: + INFOP("%s\n", cmdname(cmd)); + break; + + case ReportBitMap: /* don't report this */ + break; + + case Data: + case DataReply: + case RSDataReply: + + case RecvAck: /* yes I know. 
but it is the same layout */ + case WriteAck: + case NegAck: + + case DataRequest: + case RSDataRequest: + INFOP("%s (%lu,%llx)\n", cmdname(cmd), + (long)be64_to_cpu(p->Data.sector), (long long)p->Data.block_id + ); + break; + + case Barrier: + case BarrierAck: + INFOP("%s (%u)\n", cmdname(cmd), p->Barrier.barrier); + break; + + default: + INFOP("%s (%u)\n",cmdname(cmd), cmd); + break; + } +} +#else +#define dump_packet(ignored...) ((void)0) +#endif + + +#ifndef sector_div +# define sector_div(n, b)( \ +{ \ + int _res; \ + _res = (n) % (b); \ + (n) /= (b); \ + _res; \ +} \ +) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +// this is a direct copy from 2.6.6 include/linux/bitops.h + +static inline unsigned long generic_hweight64(u64 w) +{ +#if BITS_PER_LONG < 64 + return generic_hweight32((unsigned int)(w >> 32)) + + generic_hweight32((unsigned int)w); +#else + u64 res; + res = (w & 0x5555555555555555ul) + ((w >> 1) & 0x5555555555555555ul); + res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul); + res = (res & 0x0F0F0F0F0F0F0F0Ful) + ((res >> 4) & 0x0F0F0F0F0F0F0F0Ful); + res = (res & 0x00FF00FF00FF00FFul) + ((res >> 8) & 0x00FF00FF00FF00FFul); + res = (res & 0x0000FFFF0000FFFFul) + ((res >> 16) & 0x0000FFFF0000FFFFul); + return (res & 0x00000000FFFFFFFFul) + ((res >> 32) & 0x00000000FFFFFFFFul); +#endif +} + +static inline unsigned long hweight_long(unsigned long w) +{ + return sizeof(w) == 4 ? generic_hweight32(w) : generic_hweight64(w); +} +#endif + +static inline void drbd_suicide(void) +{ +#ifdef TASK_ZOMBIE + set_current_state(TASK_ZOMBIE); +#else + current->exit_state = EXIT_ZOMBIE; +#endif + schedule(); +} --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./drivers/block/drbd/drbd_main.c 2006-02-10 15:23:47.000000000 +0300 @@ -0,0 +1,2233 @@ +/* +-*- Linux-c -*- + drbd.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 1999-2004, Philipp Reisner . + main author. + + Copyright (C) 2002-2004, Lars Ellenberg . + main contributor. + + Copyright (C) 2000, Marcelo Tosatti . + Early 2.3.x work. + + Copyright (C) 2001, Lelik P.Korchagin . + Initial devfs support. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) || defined(HAVE_MM_INLINE_H) +#include +#endif +#include +#include + +#define __KERNEL_SYSCALLS__ +#include +#include + +#include +#include "drbd_int.h" + +/* YES. 
We got an official device major from lanana + */ +#define LANANA_DRBD_MAJOR 147 + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +# if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64) || defined(CONFIG_X86_64) +extern int register_ioctl32_conversion(unsigned int cmd, + int (*handler)(unsigned int, + unsigned int, + unsigned long, + struct file *)); +extern int unregister_ioctl32_conversion(unsigned int cmd); +extern asmlinkage int sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg); +# endif +#else +# ifdef CONFIG_COMPAT +# if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,10) + /* FIXME on which thing could we test instead of the KERNEL_VERSION + * again? register_ioctl32_conversion was deprecated in 2.6.10, got + * "officially" deprecated somewhen in 2.6.12, and removed in 2.6.14. + * so lets assume all vendor kernels did the transition. */ +# define HAVE_COMPAT_IOCTL_MEMBER +# else +# include +# endif +# endif +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +static devfs_handle_t devfs_handle; +#endif + +int drbdd_init(struct Drbd_thread*); +int drbd_worker(struct Drbd_thread*); +int drbd_asender(struct Drbd_thread*); + +int drbd_init(void); +STATIC int drbd_open(struct inode *inode, struct file *file); +STATIC int drbd_close(struct inode *inode, struct file *file); + +#ifdef DEVICE_REQUEST +#undef DEVICE_REQUEST +#endif +#define DEVICE_REQUEST drbd_do_request + +MODULE_AUTHOR("Philipp Reisner , Lars Ellenberg "); +MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); +MODULE_LICENSE("GPL"); +MODULE_PARM_DESC(use_nbd_major, "DEPRECATED! use nbd device major nr (43) " + "instead of the default " __stringify(LANANA_DRBD_MAJOR) ); +MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)"); +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +MODULE_PARM(use_nbd_major,"i"); +MODULE_PARM(minor_count,"i"); +#else +#include +MODULE_PARM_DESC(disable_bd_claim, "DONT USE! disables block device claiming" ); +/* + * please somebody explain to me what the "perm" of the module_param + * macro is good for (yes, permission for it in the "driverfs", but what + * do we need to do for them to show up, to begin with?) + * once I understand this, and the rest of the sysfs stuff, I probably + * be able to understand how we can move from our ioctl interface to a + * proper sysfs based one. 
+ * -- lge + */ + +/* thanks to these macros, if compiled into the kernel (not-module), + * these become boot parameters: [-drbd.major_nr-], drbd.minor_count and + * drbd.disable_io_hints + */ +module_param(use_nbd_major, bool,0); +module_param(minor_count, int,0); +module_param(disable_bd_claim,bool,0); +#endif + +// module parameter, defined +int use_nbd_major = 0; +int major_nr = LANANA_DRBD_MAJOR; +#ifdef MODULE +int minor_count = 2; +#else +int minor_count = 8; +#endif +int disable_bd_claim = 0; + +// devfs name +char* drbd_devfs_name = "drbd"; + + +// global panic flag +volatile int drbd_did_panic = 0; + +/* in 2.6.x, our device mapping and config info contains our virtual gendisks + * as member "struct gendisk *vdisk;" + */ +NOT_IN_26( +STATIC int *drbd_blocksizes; +STATIC int *drbd_sizes; +) +struct Drbd_Conf *drbd_conf; +kmem_cache_t *drbd_request_cache; +kmem_cache_t *drbd_ee_cache; +mempool_t *drbd_request_mempool; + +STATIC struct block_device_operations drbd_ops = { +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,10) + .owner = THIS_MODULE, +#endif + .open = drbd_open, + .release = drbd_close, + .ioctl = drbd_ioctl, +#ifdef HAVE_COMPAT_IOCTL_MEMBER + .compat_ioctl = drbd_compat_ioctl, +#endif +}; + +#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) + +/************************* The transfer log start */ +STATIC int tl_init(drbd_dev *mdev) +{ + struct drbd_barrier *b; + + b=kmalloc(sizeof(struct drbd_barrier),GFP_KERNEL); + if(!b) return 0; + INIT_LIST_HEAD(&b->requests); + b->next=0; + b->br_number=4711; + b->n_req=0; + + mdev->oldest_barrier = b; + mdev->newest_barrier = b; + + return 1; +} + +STATIC void tl_cleanup(drbd_dev *mdev) +{ + D_ASSERT(mdev->oldest_barrier == mdev->newest_barrier); + kfree(mdev->oldest_barrier); +} + +STATIC void tl_add(drbd_dev *mdev, drbd_request_t * new_item) +{ + struct drbd_barrier *b; + + spin_lock_irq(&mdev->tl_lock); + + b=mdev->newest_barrier; + + new_item->barrier = b; + new_item->rq_status |= RQ_DRBD_IN_TL; + list_add(&new_item->w.list,&b->requests); + + if( b->n_req++ > mdev->conf.max_epoch_size ) { + set_bit(ISSUE_BARRIER,&mdev->flags); + } + + spin_unlock_irq(&mdev->tl_lock); +} + +STATIC void tl_cancel(drbd_dev *mdev, drbd_request_t * item) +{ + struct drbd_barrier *b; + + spin_lock_irq(&mdev->tl_lock); + + b=item->barrier; + b->n_req--; + + list_del(&item->w.list); + item->rq_status &= ~RQ_DRBD_IN_TL; + + spin_unlock_irq(&mdev->tl_lock); +} + +STATIC unsigned int tl_add_barrier(drbd_dev *mdev) +{ + unsigned int bnr; + static int barrier_nr_issue=1; + struct drbd_barrier *b; + + barrier_nr_issue++; + + // THINK this is called in the IO path with the send_mutex held + // and GFP_KERNEL may itself start IO. set it to GFP_NOIO. 
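For orientation, this is how one write epoch travels through the transfer log: requests join the newest barrier via tl_add(), tl_add_barrier() closes that epoch (returning its number) when a Barrier packet is about to go out, and tl_release() frees it once the peer's BarrierAck for exactly that number has arrived. A condensed sketch, with locking, error handling and the actual network sends left out:

/* sketch only; set_size is what the peer echoes back in its BarrierAck */
STATIC void example_epoch(drbd_dev *mdev, drbd_request_t *req,
                          unsigned int set_size)
{
    unsigned int barrier_nr;

    tl_add(mdev, req);                  /* the write joins the current epoch   */
    barrier_nr = tl_add_barrier(mdev);  /* close that epoch, open the next one */
    /* a Barrier packet carrying barrier_nr goes out on the data socket;
     * when the matching BarrierAck comes back, the epoch can be released: */
    tl_release(mdev, barrier_nr, set_size);
}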
+ b=kmalloc(sizeof(struct drbd_barrier),GFP_NOIO); + if(!b) { + ERR("could not kmalloc() barrier\n"); + return 0; + } + INIT_LIST_HEAD(&b->requests); + b->next=0; + b->br_number=barrier_nr_issue; + b->n_req=0; + + spin_lock_irq(&mdev->tl_lock); + + bnr = mdev->newest_barrier->br_number; + mdev->newest_barrier->next = b; + mdev->newest_barrier = b; + + spin_unlock_irq(&mdev->tl_lock); + + return bnr; +} + +void tl_release(drbd_dev *mdev,unsigned int barrier_nr, + unsigned int set_size) +{ + struct drbd_barrier *b; + + spin_lock_irq(&mdev->tl_lock); + + b = mdev->oldest_barrier; + mdev->oldest_barrier = b->next; + + list_del(&b->requests); + /* There could be requests on the list waiting for completion + of the write to the local disk, to avoid corruptions of + slab's data structures we have to remove the lists head */ + + spin_unlock_irq(&mdev->tl_lock); + + D_ASSERT(b->br_number == barrier_nr); + D_ASSERT(b->n_req == set_size); + + kfree(b); +} + +/* tl_dependence reports if this sector was present in the current + epoch. + As side effect it clears also the pointer to the request if it + was present in the transfert log. (Since tl_dependence indicates + that IO is complete and that drbd_end_req() should not be called + in case tl_clear has to be called due to interruption of the + communication) +*/ +/* bool */ +int tl_dependence(drbd_dev *mdev, drbd_request_t * item) +{ + unsigned long flags; + int r=TRUE; + + spin_lock_irqsave(&mdev->tl_lock,flags); + + r = ( item->barrier == mdev->newest_barrier ); + list_del(&item->w.list); + + spin_unlock_irqrestore(&mdev->tl_lock,flags); + return r; +} + +void tl_clear(drbd_dev *mdev) +{ + struct list_head *le,*tle; + struct drbd_barrier *b,*f,*new_first; + struct drbd_request *r; + sector_t sector; + unsigned int size; + + new_first=kmalloc(sizeof(struct drbd_barrier),GFP_KERNEL); + if(!new_first) { + ERR("could not kmalloc() barrier\n"); + } + + INIT_LIST_HEAD(&new_first->requests); + new_first->next=0; + new_first->br_number=4711; + new_first->n_req=0; + + spin_lock_irq(&mdev->tl_lock); + + b=mdev->oldest_barrier; + mdev->oldest_barrier = new_first; + mdev->newest_barrier = new_first; + + spin_unlock_irq(&mdev->tl_lock); + + inc_ap_pending(mdev); // Since we count the old first as well... + + while ( b ) { + list_for_each_safe(le, tle, &b->requests) { + r = list_entry(le, struct drbd_request,w.list); + // bi_size and bi_sector are modified in bio_endio! + sector = drbd_req_get_sector(r); + size = drbd_req_get_size(r); + if( !(r->rq_status & RQ_DRBD_SENT) ) { + if(mdev->conf.wire_protocol != DRBD_PROT_A ) + dec_ap_pending(mdev); + drbd_end_req(r,RQ_DRBD_SENT,ERF_NOTLD|1, sector); + goto mark; + } + if(mdev->conf.wire_protocol != DRBD_PROT_C ) { + mark: + drbd_set_out_of_sync(mdev, sector, size); + } + } + f=b; + b=b->next; + list_del(&f->requests); + kfree(f); + dec_ap_pending(mdev); // for the barrier + } +} + +/** + * drbd_io_error: Handles the on_io_error setting, should be called in the + * unlikely(!drbd_bio_uptodate(e->bio)) case from kernel thread context. + * See also drbd_chk_io_error + * + * NOTE: we set ourselves DISKLESS here. + * But we try to write the "need full sync bit" here anyways. This is to make sure + * that you get a resynchronisation of the full device the next time you + * connect. 
+ */ +int drbd_io_error(drbd_dev* mdev) +{ + int ok=1; + + if(mdev->on_io_error != Panic && mdev->on_io_error != Detach) return 1; + if(test_and_set_bit(SENT_DISK_FAILURE,&mdev->flags)) return 1; + + D_ASSERT(test_bit(DISKLESS,&mdev->flags)); + ok = drbd_send_param(mdev,0); + WARN("Notified peer that my disk is broken.\n"); + + D_ASSERT(drbd_md_test_flag(mdev,MDF_FullSync)); + D_ASSERT(!drbd_md_test_flag(mdev,MDF_Consistent)); + if (test_bit(MD_DIRTY,&mdev->flags)) { + // try to get "inconsistent, need full sync" to MD + drbd_md_write(mdev); + } + + if(mdev->cstate > Connected ) { + WARN("Resync aborted.\n"); + set_cstate(mdev,Connected); + mdev->rs_total = 0; + } + if ( wait_event_interruptible_timeout(mdev->cstate_wait, + atomic_read(&mdev->local_cnt) == 0 , HZ ) <= 0) { + WARN("Not releasing backing storage device.\n"); + /* FIXME if there *are* still references, + * we should be here again soon enough. + * but what if not? + * we still should free our ll and md devices */ + } else { + /* no race. since the DISKLESS bit is set first, + * further references to local_cnt are shortlived, + * and no real references on the device. */ + WARN("Releasing backing storage device.\n"); + drbd_free_ll_dev(mdev); + mdev->la_size=0; + } + + return ok; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,14) +// daemonize was no global symbol before 2.4.14 +/* in 2.4.6 is is prototyped as + * void daemonize(const char *name, ...) + * though, so maybe we want to do this for 2.4.x already, too. + */ +void daemonize(void) +{ + struct fs_struct *fs; + + exit_mm(current); + + current->session = 1; + current->pgrp = 1; + current->tty = NULL; + + exit_fs(current); /* current->fs->count--; */ + fs = init_task.fs; + current->fs = fs; + atomic_inc(&fs->count); + exit_files(current); + current->files = init_task.files; + atomic_inc(¤t->files->count); +} +#endif + +STATIC void drbd_daemonize(void) { +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0) + daemonize("drbd_thread"); +#else + daemonize(); + // VERIFY what about blocking signals ? + reparent_to_init(); +#endif +} + +void _set_cstate(drbd_dev* mdev,Drbd_CState ns) +{ + Drbd_CState os; + + os = mdev->cstate; + +#if DUMP_MD >= 2 + INFO("%s [%d]: cstate %s --> %s\n", current->comm, current->pid, + cstate_to_name(os), cstate_to_name(ns) ); +#endif + + mdev->cstate = ns; + smp_mb(); + wake_up(&mdev->cstate_wait); + + /* THINK. + * was: + * if ( ( os==SyncSource || os==SyncTarget ) && ns <= Connected ) { + */ + if ( ( os >= SyncSource ) && ns <= Connected ) { + clear_bit(SYNC_STARTED,&mdev->flags); + set_bit(STOP_SYNC_TIMER,&mdev->flags); + mod_timer(&mdev->resync_timer,jiffies); + } + if(test_bit(MD_IO_ALLOWED,&mdev->flags) && + test_bit(DISKLESS,&mdev->flags) && ns < Connected) { +// FIXME EXPLAIN + clear_bit(MD_IO_ALLOWED,&mdev->flags); + } +} + +STATIC int drbd_thread_setup(void* arg) +{ + struct Drbd_thread *thi = (struct Drbd_thread *) arg; + drbd_dev *mdev = thi->mdev; + int retval; + + drbd_daemonize(); + D_ASSERT(get_t_state(thi) == Running); + D_ASSERT(thi->task == NULL); + spin_lock(&thi->t_lock); + thi->task = current; + smp_mb(); + spin_unlock(&thi->t_lock); + complete(&thi->startstop); // notify: thi->task is set. + + retval = thi->function(thi); + + spin_lock(&thi->t_lock); + thi->task = 0; + thi->t_state = Exiting; + smp_mb(); + spin_unlock(&thi->t_lock); + + // THINK maybe two different completions? + complete(&thi->startstop); // notify: thi->task unset. 
+ + return retval; +} + +STATIC void drbd_thread_init(drbd_dev *mdev, struct Drbd_thread *thi, + int (*func) (struct Drbd_thread *)) +{ + thi->t_lock = SPIN_LOCK_UNLOCKED; + thi->task = NULL; + thi->t_state = None; + init_completion(&thi->startstop); + + thi->function = func; + thi->mdev = mdev; +} + +void drbd_thread_start(struct Drbd_thread *thi) +{ + int pid; + drbd_dev *mdev = thi->mdev; + + spin_lock(&thi->t_lock); + + /* INFO("%s [%d]: %s %d -> Running\n", + current->comm, current->pid, + thi == &mdev->receiver ? "receiver" : + thi == &mdev->asender ? "asender" : + thi == &mdev->worker ? "worker" : "NONSENSE", + thi->t_state); */ + + if (thi->t_state == None) { + D_ASSERT(thi->task == NULL); + thi->t_state = Running; + spin_unlock(&thi->t_lock); + + pid = kernel_thread(drbd_thread_setup, (void *) thi, CLONE_FS); + if (pid < 0) { + ERR("Couldn't start thread (%d)\n", pid); + return; + } + wait_for_completion(&thi->startstop); // waits until thi->task is set + D_ASSERT(thi->task); + D_ASSERT(get_t_state(thi) == Running); + } else { + spin_unlock(&thi->t_lock); + } +} + + +void _drbd_thread_stop(struct Drbd_thread *thi, int restart,int wait) +{ + drbd_dev *mdev = thi->mdev; + Drbd_thread_state ns = restart ? Restarting : Exiting; + + spin_lock(&thi->t_lock); + + /* INFO("%s [%d]: %s %d -> %d; %d\n", + current->comm, current->pid, + thi->task ? thi->task->comm : "NULL", thi->t_state, ns, wait); */ + + + if (thi->t_state == None) { + spin_unlock(&thi->t_lock); + return; + } + + if (thi->t_state != ns) { + ERR_IF (thi->task == NULL) { + spin_unlock(&thi->t_lock); + return; + } + + if (ns == Restarting && thi->t_state == Exiting) { + // Already Exiting. Cannot restart! + spin_unlock(&thi->t_lock); + return; + } + + thi->t_state = ns; + smp_mb(); + if (thi->task != current) + force_sig(DRBD_SIGKILL,thi->task); + else + D_ASSERT(!wait); + + } + spin_unlock(&thi->t_lock); + + if (wait) { + D_ASSERT(thi->t_state == Exiting); + wait_for_completion(&thi->startstop); + spin_lock(&thi->t_lock); + thi->t_state = None; + smp_mb(); + D_ASSERT(thi->task == NULL); + spin_unlock(&thi->t_lock); + } +} + +inline sigset_t drbd_block_all_signals(void) +{ + unsigned long flags; + sigset_t oldset; + LOCK_SIGMASK(current,flags); + oldset = current->blocked; + sigfillset(¤t->blocked); + RECALC_SIGPENDING(); + UNLOCK_SIGMASK(current,flags); + return oldset; +} + +inline void restore_old_sigset(sigset_t oldset) +{ + unsigned long flags; + LOCK_SIGMASK(current,flags); + // _never_ propagate this to anywhere... 
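	// (that is: a DRBD_SIG which is still pending while all signals were
	// blocked is simply dropped on the floor here, instead of being
	// delivered somewhere unexpected once the saved mask is put back)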
+ sigdelset(¤t->pending.signal, DRBD_SIG); + current->blocked = oldset; + RECALC_SIGPENDING(); + UNLOCK_SIGMASK(current,flags); +} + +int _drbd_send_cmd(drbd_dev *mdev, struct socket *sock, + Drbd_Packet_Cmd cmd, Drbd_Header *h, + size_t size, unsigned msg_flags) +{ + int sent,ok; + + ERR_IF(!h) return FALSE; + ERR_IF(!size) return FALSE; + + h->magic = BE_DRBD_MAGIC; + h->command = cpu_to_be16(cmd); + h->length = cpu_to_be16(size-sizeof(Drbd_Header)); + + dump_packet(mdev,sock,0,(void*)h, __FILE__, __LINE__); + sent = drbd_send(mdev,sock,h,size,msg_flags); + + ok = ( sent == size ); + if(!ok) { + ERR("short sent %s size=%d sent=%d\n", + cmdname(cmd), (int)size, sent); + } + return ok; +} + +int drbd_send_cmd(drbd_dev *mdev, struct socket *sock, + Drbd_Packet_Cmd cmd, Drbd_Header* h, size_t size) +{ + int ok; + sigset_t old_blocked; + + if (sock == mdev->data.socket) { + down(&mdev->data.mutex); + spin_lock(&mdev->send_task_lock); + mdev->send_task=current; + spin_unlock(&mdev->send_task_lock); + } else + down(&mdev->meta.mutex); + + old_blocked = drbd_block_all_signals(); + ok = _drbd_send_cmd(mdev,sock,cmd,h,size,0); + restore_old_sigset(old_blocked); + + if (sock == mdev->data.socket) { + spin_lock(&mdev->send_task_lock); + mdev->send_task=NULL; + spin_unlock(&mdev->send_task_lock); + up(&mdev->data.mutex); + } else + up(&mdev->meta.mutex); + return ok; +} + +int drbd_send_sync_param(drbd_dev *mdev, struct syncer_config *sc) +{ + Drbd_SyncParam_Packet p; + int ok; + + p.rate = cpu_to_be32(sc->rate); + p.use_csums = cpu_to_be32(sc->use_csums); + p.skip = cpu_to_be32(sc->skip); + p.group = cpu_to_be32(sc->group); + + ok = drbd_send_cmd(mdev,mdev->data.socket,SyncParam,(Drbd_Header*)&p,sizeof(p)); + if ( ok + && (mdev->cstate == SkippedSyncS || mdev->cstate == SkippedSyncT) + && !sc->skip ) + { + /* FIXME EXPLAIN. I think this cannot work properly! -lge */ + set_cstate(mdev,WFReportParams); + ok = drbd_send_param(mdev,0); + } + return ok; +} + +int drbd_send_param(drbd_dev *mdev, int flags) +{ + Drbd_Parameter_Packet p; + int i, ok, have_disk; + unsigned long m_size; // sector_t ?? + + have_disk=inc_local(mdev); + if(have_disk) { + D_ASSERT(mdev->backing_bdev); + if (mdev->md_index == -1 ) m_size = drbd_md_ss(mdev)>>1; + else m_size = drbd_get_capacity(mdev->backing_bdev)>>1; + } else m_size = 0; + + p.u_size = cpu_to_be64(mdev->lo_usize); + p.p_size = cpu_to_be64(m_size); + + p.state = cpu_to_be32(mdev->state); + p.protocol = cpu_to_be32(mdev->conf.wire_protocol); + p.version = cpu_to_be32(PRO_VERSION); + + for (i = Flags; i < GEN_CNT_SIZE; i++) { + p.gen_cnt[i] = cpu_to_be32(mdev->gen_cnt[i]); + } + p.sync_rate = cpu_to_be32(mdev->sync_conf.rate); + p.sync_use_csums = cpu_to_be32(mdev->sync_conf.use_csums); + p.skip_sync = cpu_to_be32(mdev->sync_conf.skip); + p.sync_group = cpu_to_be32(mdev->sync_conf.group); + p.flags = cpu_to_be32(flags); + p.magic = BE_DRBD_MAGIC; + + ok = drbd_send_cmd(mdev,mdev->data.socket,ReportParams,(Drbd_Header*)&p,sizeof(p)); + if (have_disk) dec_local(mdev); + return ok; +} + +/* See the comment at receive_bitmap() */ +int _drbd_send_bitmap(drbd_dev *mdev) +{ + int want; + int ok=TRUE, bm_i=0; + size_t bm_words, num_words; + unsigned long *buffer; + Drbd_Header *p; + + ERR_IF(!mdev->bitmap) return FALSE; + + bm_words = drbd_bm_words(mdev); + p = vmalloc(PAGE_SIZE); // sleeps. cannot fail. 
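	/*
	 * The page just allocated is (re)used as one on-the-wire packet per
	 * loop iteration below; as a sketch, its layout is
	 *
	 *	+---------------------+--------------------------------------+
	 *	| Drbd_Header         | payload: up to BM_PACKET_WORDS longs |
	 *	| magic, command, len | (one chunk of the bitmap, fetched    |
	 *	|                     |  with drbd_bm_get_lel)               |
	 *	+---------------------+--------------------------------------+
	 *
	 * and chunks are sent until all bm_words words have gone out.
	 */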
+ buffer = (unsigned long*)p->payload; + + if (drbd_md_test_flag(mdev,MDF_FullSync)) { + drbd_bm_set_all(mdev); + drbd_bm_write(mdev); + if (unlikely(test_bit(DISKLESS,&mdev->flags))) { + /* write_bm did fail! panic. + * FIXME can we do something better than panic? + */ + drbd_panic("Failed to write bitmap to disk\n!"); + ok = FALSE; + goto out; + } + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_write(mdev); + } + + /* + * maybe TODO use some simple compression scheme, nowadays there are + * some such algorithms in the kernel anyways. + */ + do { + num_words = min_t(size_t, BM_PACKET_WORDS, bm_words-bm_i ); + want = num_words * sizeof(long); + if (want) { + drbd_bm_get_lel(mdev, bm_i, num_words, buffer); + } + ok = _drbd_send_cmd(mdev,mdev->data.socket,ReportBitMap, + p, sizeof(*p) + want, 0); + bm_i += num_words; + } while (ok && want); + + out: + vfree(p); + return ok; +} + +int drbd_send_bitmap(drbd_dev *mdev) +{ + int ok; + down(&mdev->data.mutex); + ok=_drbd_send_bitmap(mdev); + up(&mdev->data.mutex); + return ok; +} + +int _drbd_send_barrier(drbd_dev *mdev) +{ + int ok; + Drbd_Barrier_Packet p; + + /* printk(KERN_DEBUG DEVICE_NAME": issuing a barrier\n"); */ + /* tl_add_barrier() must be called with the sock_mutex aquired */ + p.barrier=tl_add_barrier(mdev); + + inc_ap_pending(mdev); + ok = _drbd_send_cmd(mdev,mdev->data.socket,Barrier,(Drbd_Header*)&p,sizeof(p),0); + +// if (!ok) dec_ap_pending(mdev); // is done in tl_clear() + return ok; +} + +int drbd_send_b_ack(drbd_dev *mdev, u32 barrier_nr,u32 set_size) +{ + int ok; + Drbd_BarrierAck_Packet p; + + p.barrier = barrier_nr; + p.set_size = cpu_to_be32(set_size); + + ok = drbd_send_cmd(mdev,mdev->meta.socket,BarrierAck,(Drbd_Header*)&p,sizeof(p)); + return ok; +} + + +int drbd_send_ack(drbd_dev *mdev, Drbd_Packet_Cmd cmd, struct Tl_epoch_entry *e) +{ + int ok; + Drbd_BlockAck_Packet p; + + p.sector = cpu_to_be64(drbd_ee_get_sector(e)); + p.block_id = e->block_id; + p.blksize = cpu_to_be32(drbd_ee_get_size(e)); + + if (!mdev->meta.socket || mdev->cstate < Connected) return FALSE; + ok = drbd_send_cmd(mdev,mdev->meta.socket,cmd,(Drbd_Header*)&p,sizeof(p)); + return ok; +} + +int drbd_send_drequest(drbd_dev *mdev, int cmd, + sector_t sector,int size, u64 block_id) +{ + int ok; + Drbd_BlockRequest_Packet p; + + p.sector = cpu_to_be64(sector); + p.block_id = block_id; + p.blksize = cpu_to_be32(size); + + ok = drbd_send_cmd(mdev,mdev->data.socket,cmd,(Drbd_Header*)&p,sizeof(p)); + return ok; +} + +/* called on sndtimeo + * returns FALSE if we should retry, + * TRUE if we think connection is dead + */ +STATIC int we_should_drop_the_connection(drbd_dev *mdev, struct socket *sock) +{ + int drop_it; + // long elapsed = (long)(jiffies - mdev->last_received); + // DUMPLU(elapsed); // elapsed ignored for now. + + drop_it = mdev->meta.socket == sock + || !mdev->asender.task + || get_t_state(&mdev->asender) != Running + || (volatile int)mdev->cstate < Connected; + + if (drop_it) + return TRUE; + + drop_it = !--mdev->ko_count; + if ( !drop_it ) { + ERR("[%s/%d] sock_sendmsg time expired, ko = %u\n", + current->comm, current->pid, mdev->ko_count); + request_ping(mdev); + } + + return drop_it; /* && (mdev->state == Primary) */; +} + +/* The idea of sendpage seems to be to put some kind of reference + to the page into the skb, and to hand it over to the NIC. In + this process get_page() gets called. + + As soon as the page was really sent over the network put_page() + gets called by some part of the network layer. [ NIC driver? 
] + + [ get_page() / put_page() increment/decrement the count. If count + reaches 0 the page will be freed. ] + + This works nicely with pages from FSs. + But this means that in protocol A we might signal IO completion too early ! + + In order not to corrupt data during a resync we must make sure + that we do not reuse our own buffer pages (EEs) to early, therefore + we have the net_ee list. + + XFS seems to have problems, still, it submits pages with page_count == 0! + As a workaround, we disable sendpage on pages with page_count == 0 or PageSlab. +*/ +int _drbd_no_send_page(drbd_dev *mdev, struct page *page, + int offset, size_t size) +{ + int ret; + ret = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0); + kunmap(page); + return ret; +} + +#ifdef DRBD_DISABLE_SENDPAGE +int _drbd_send_page(drbd_dev *mdev, struct page *page, + int offset, size_t size) +{ + int sent,ok; + int len = size; + + spin_lock(&mdev->send_task_lock); + mdev->send_task=current; + spin_unlock(&mdev->send_task_lock); + + sent = _drbd_no_send_page(mdev, page, offset, size); + if (likely(sent > 0)) len -= sent; + + spin_lock(&mdev->send_task_lock); + mdev->send_task=NULL; + spin_unlock(&mdev->send_task_lock); + + ok = (len == 0); + if (likely(ok)) + mdev->send_cnt += size>>9; + return ok; +} +#else +int _drbd_send_page(drbd_dev *mdev, struct page *page, + int offset, size_t size) +{ + mm_segment_t oldfs = get_fs(); + int sent,ok; + int len = size; + +#ifdef SHOW_SENDPAGE_USAGE + unsigned long now = jiffies; + static unsigned long total = 0; + static unsigned long fallback = 0; + static unsigned long last_rep = 0; + + /* report statistics every hour, + * if we had at least one fallback. + */ + ++total; + if (fallback && time_before(last_rep+3600*HZ, now)) { + last_rep = now; + printk(KERN_INFO DEVICE_NAME + ": sendpage() omitted: %lu/%lu\n", fallback, total); + } +#endif + + + spin_lock(&mdev->send_task_lock); + mdev->send_task=current; + spin_unlock(&mdev->send_task_lock); + + /* PARANOIA. if this ever triggers, + * something in the layers above us is really kaputt. + *one roundtrip later: + * doh. it triggered. so XFS _IS_ really kaputt ... + * oh well... + */ + if ( (page_count(page) < 1) || PageSlab(page) ) { + /* e.g. XFS meta- & log-data is in slab pages, which have a + * page_count of 0 and/or have PageSlab() set... + */ +#ifdef SHOW_SENDPAGE_USAGE + ++fallback; +#endif + sent = _drbd_no_send_page(mdev, page, offset, size); + if (likely(sent > 0)) len -= sent; + goto out; + } + + set_fs(KERNEL_DS); + do { + sent = mdev->data.socket->ops->sendpage(mdev->data.socket,page, + offset,len, + MSG_NOSIGNAL); + if (sent == -EAGAIN) { + if (we_should_drop_the_connection(mdev, + mdev->data.socket)) + break; + else + continue; + } + if (sent <= 0) { + WARN("%s: size=%d len=%d sent=%d\n", + __func__,(int)size,len,sent); + break; + } + len -= sent; + offset += sent; + // FIXME test "last_received" ... + } while(len > 0 /* THINK && mdev->cstate >= Connected*/); + set_fs(oldfs); + + out: + spin_lock(&mdev->send_task_lock); + mdev->send_task=NULL; + spin_unlock(&mdev->send_task_lock); + + ok = (len == 0); + if (likely(ok)) + mdev->send_cnt += size>>9; + return ok; +} +#endif + +// Used to send write requests: bh->b_rsector !! 
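/*
 * A minimal sketch of the rule described in the sendpage comment above; the
 * helper name is made up for illustration and is not part of DRBD itself.
 * ->sendpage() takes an extra reference on the page, and only after the
 * network stack is really done with the data is that reference dropped, so
 * a buffer page may only be recycled once its count is back to "just ours".
 */
static inline int example_page_still_in_flight(struct page *page)
{
	/* an elevated count means some other party (e.g. the NIC driver)
	 * still holds the page; do not reuse the buffer yet */
	return page_count(page) > 1;
}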
+int drbd_send_dblock(drbd_dev *mdev, drbd_request_t *req) +{ + int ok=1; + sigset_t old_blocked; + Drbd_Data_Packet p; + + ERR_IF(!req || !req->master_bio) return FALSE; + + p.head.magic = BE_DRBD_MAGIC; + p.head.command = cpu_to_be16(Data); + p.head.length = cpu_to_be16( sizeof(p)-sizeof(Drbd_Header) + + drbd_req_get_size(req) ); + + p.sector = cpu_to_be64(drbd_req_get_sector(req)); + p.block_id = (unsigned long)req; + + /* About tl_add(): + 1. This must be within the semaphor, + to ensure right order in tl_ data structure and to + ensure right order of packets on the write + 2. This must happen before sending, otherwise we might + get in the BlockAck packet before we have it on the + tl_ datastructure (=> We would want to remove it before it + is there!) + 3. Q: Why can we add it to tl_ even when drbd_send() might fail ? + There could be a tl_cancel() to remove it within the semaphore! + A: If drbd_send fails, we will loose the connection. Then + tl_cear() will simulate a RQ_DRBD_SEND and set it out of sync + for everything in the data structure. + */ + + /* Still called directly by drbd_make_request, + * so all sorts of processes may end up here. + * They may be interrupted by DRBD_SIG in response to + * ioctl or some other "connection lost" event. + * This is not propagated. + */ + + old_blocked = drbd_block_all_signals(); + down(&mdev->data.mutex); + spin_lock(&mdev->send_task_lock); + mdev->send_task=current; + spin_unlock(&mdev->send_task_lock); + + if(test_and_clear_bit(ISSUE_BARRIER,&mdev->flags)) + ok = _drbd_send_barrier(mdev); + if(ok) { + tl_add(mdev,req); + dump_packet(mdev,mdev->data.socket,0,(void*)&p, __FILE__, __LINE__); + set_bit(UNPLUG_REMOTE,&mdev->flags); + ok = sizeof(p) == drbd_send(mdev,mdev->data.socket,&p,sizeof(p),MSG_MORE); + if(ok) { + if(mdev->conf.wire_protocol == DRBD_PROT_A) { + ok = _drbd_send_bio(mdev,drbd_req_private_bio(req)); + } else { + ok = _drbd_send_zc_bio(mdev,drbd_req_private_bio(req)); + } + } + if(!ok) tl_cancel(mdev,req); + } + if (!ok) { + drbd_set_out_of_sync(mdev, + drbd_req_get_sector(req), + drbd_req_get_size(req)); + drbd_end_req(req,RQ_DRBD_SENT,ERF_NOTLD|1, + drbd_req_get_sector(req)); + } + spin_lock(&mdev->send_task_lock); + mdev->send_task=NULL; + spin_unlock(&mdev->send_task_lock); + + up(&mdev->data.mutex); + restore_old_sigset(old_blocked); + return ok; +} + +int drbd_send_block(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + struct Tl_epoch_entry *e) +{ + int ok; + sigset_t old_blocked; + Drbd_Data_Packet p; + + p.head.magic = BE_DRBD_MAGIC; + p.head.command = cpu_to_be16(cmd); + p.head.length = cpu_to_be16( sizeof(p)-sizeof(Drbd_Header) + + drbd_ee_get_size(e) ); + + p.sector = cpu_to_be64(drbd_ee_get_sector(e)); + p.block_id = e->block_id; + + /* Only called by our kernel thread. + * This one may be interupted by DRBD_SIG and/or DRBD_SIGKILL + * in response to ioctl or module unload. 
+ */ + old_blocked = drbd_block_all_signals(); + down(&mdev->data.mutex); + spin_lock(&mdev->send_task_lock); + mdev->send_task=current; + spin_unlock(&mdev->send_task_lock); + + dump_packet(mdev,mdev->data.socket,0,(void*)&p, __FILE__, __LINE__); + ok = sizeof(p) == drbd_send(mdev,mdev->data.socket,&p,sizeof(p),MSG_MORE); + if (ok) ok = _drbd_send_zc_bio(mdev,&e->private_bio); + + spin_lock(&mdev->send_task_lock); + mdev->send_task=NULL; + spin_unlock(&mdev->send_task_lock); + up(&mdev->data.mutex); + restore_old_sigset(old_blocked); + return ok; +} + +/* + drbd_send distinguishes two cases: + + Packets sent via the data socket "sock" + and packets sent via the meta data socket "msock" + + sock msock + -----------------+-------------------------+------------------------------ + timeout conf.timeout / 2 conf.timeout / 2 + timeout action send a ping via msock Abort communication + and close all sockets +*/ + +/* + * you should have down()ed the appropriate [m]sock_mutex elsewhere! + */ +int drbd_send(drbd_dev *mdev, struct socket *sock, + void* buf, size_t size, unsigned msg_flags) +{ +#if !HAVE_KERNEL_SENDMSG + mm_segment_t oldfs; + struct iovec iov; +#else + struct kvec iov; +#endif + struct msghdr msg; + int rv,sent=0; + + if (!sock) return -1000; + if ((volatile int)mdev->cstate < WFReportParams) return -1001; + + // THINK if (signal_pending) return ... ? + + iov.iov_base = buf; + iov.iov_len = size; + + msg.msg_name = 0; + msg.msg_namelen = 0; +#if !HAVE_KERNEL_SENDMSG + msg.msg_iov = &iov; + msg.msg_iovlen = 1; +#endif + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = msg_flags | MSG_NOSIGNAL; + +#if !HAVE_KERNEL_SENDMSG + oldfs = get_fs(); + set_fs(KERNEL_DS); +#endif + + if (sock == mdev->data.socket) + mdev->ko_count = mdev->conf.ko_count; + do { + /* STRANGE + * tcp_sendmsg does _not_ use its size parameter at all ? + * + * -EAGAIN on timeout, -EINTR on signal. + */ +/* THINK + * do we need to block DRBD_SIG if sock == &meta.socket ?? + * otherwise wake_asender() might interrupt some send_*Ack ! + */ +#if !HAVE_KERNEL_SENDMSG + rv = sock_sendmsg(sock, &msg, iov.iov_len ); +#else + rv = kernel_sendmsg(sock, &msg, &iov, 1, size); +#endif + if (rv == -EAGAIN) { + if (we_should_drop_the_connection(mdev,sock)) + break; + else + continue; + } + D_ASSERT(rv != 0); + if (rv == -EINTR ) { +#if 0 + /* FIXME this happens all the time. + * we don't care for now! + * eventually this should be sorted out be the proper + * use of the SIGNAL_ASENDER bit... */ + if (DRBD_ratelimit(5*HZ,5)) { + DBG("Got a signal in drbd_send(,%c,)!\n", + sock == mdev->meta.socket ? 'm' : 's'); + // dump_stack(); + } +#endif + drbd_flush_signals(current); + rv = 0; + } + if (rv < 0) break; + sent += rv; + iov.iov_base += rv; + iov.iov_len -= rv; + } while(sent < size); + +#if !HAVE_KERNEL_SENDMSG + set_fs(oldfs); +#endif + + if (rv <= 0) { + if (rv != -EAGAIN) { + ERR("%s_sendmsg returned %d\n", + sock == mdev->meta.socket ? 
"msock" : "sock", + rv); + set_cstate(mdev, BrokenPipe); + } else + set_cstate(mdev, Timeout); + drbd_thread_restart_nowait(&mdev->receiver); + } + + return sent; +} + +STATIC int drbd_open(struct inode *inode, struct file *file) +{ + int minor; + + minor = MINOR(inode->i_rdev); + if(minor >= minor_count) return -ENODEV; + + if (file->f_mode & FMODE_WRITE) { + if( drbd_conf[minor].state == Secondary) { + return -EROFS; + } + set_bit(WRITER_PRESENT, &drbd_conf[minor].flags); + } + + drbd_conf[minor].open_cnt++; + + NOT_IN_26(MOD_INC_USE_COUNT;) + + return 0; +} + +STATIC int drbd_close(struct inode *inode, struct file *file) +{ + /* do not use *file (May be NULL, in case of a unmount :-) */ + int minor; + + minor = MINOR(inode->i_rdev); + if(minor >= minor_count) return -ENODEV; + + /* + printk(KERN_ERR DEVICE_NAME ": close(inode=%p,file=%p)" + "current=%p,minor=%d,wc=%d\n", inode, file, current, minor, + inode->i_writecount); + */ + + if (--drbd_conf[minor].open_cnt == 0) { + clear_bit(WRITER_PRESENT, &drbd_conf[minor].flags); + } + + NOT_IN_26(MOD_DEC_USE_COUNT;) + + return 0; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +STATIC void drbd_unplug_fn(void *data) +{ + struct Drbd_Conf* mdev = (drbd_dev*)data; + spin_lock_irq(&mdev->req_lock); + if (list_empty(&mdev->unplug_work.list)) + _drbd_queue_work_front(&mdev->data.work,&mdev->unplug_work); + spin_unlock_irq(&mdev->req_lock); +} +#else + +STATIC void drbd_unplug_fn(request_queue_t *q) +{ + drbd_dev *mdev = q->queuedata; + + /* unplug FIRST */ + spin_lock_irq(q->queue_lock); + blk_remove_plug(q); + spin_unlock_irq(q->queue_lock); + + /* only if connected */ + if (mdev->cstate >= Connected && !test_bit(PARTNER_DISKLESS,&mdev->flags)) { + D_ASSERT(mdev->state == Primary); + if (test_and_clear_bit(UNPLUG_REMOTE,&mdev->flags)) { + spin_lock_irq(&mdev->req_lock); + /* add to the front of the data.work queue, + * unless already queued. + * XXX this might be a good addition to drbd_queue_work + * anyways, to detect "double queuing" ... 
*/ + if (list_empty(&mdev->unplug_work.list)) + _drbd_queue_work_front(&mdev->data.work,&mdev->unplug_work); + spin_unlock_irq(&mdev->req_lock); + } + } + + if(!test_bit(DISKLESS,&mdev->flags)) drbd_kick_lo(mdev); +} +#endif + +void drbd_set_defaults(drbd_dev *mdev) +{ + mdev->flags = 1<sync_conf.rate = 250; + mdev->sync_conf.al_extents = 127; // 512 MB active set + mdev->state = Secondary; + mdev->o_state = Unknown; + mdev->cstate = Unconfigured; +} + +void drbd_init_set_defaults(drbd_dev *mdev) +{ + // the memset(,0,) did most of this + // note: only assignments, no allocation in here + +#ifdef PARANOIA + SET_MDEV_MAGIC(mdev); +#endif + + drbd_set_defaults(mdev); + + atomic_set(&mdev->ap_bio_cnt,0); + atomic_set(&mdev->ap_pending_cnt,0); + atomic_set(&mdev->rs_pending_cnt,0); + atomic_set(&mdev->unacked_cnt,0); + atomic_set(&mdev->local_cnt,0); + atomic_set(&mdev->resync_locked,0); + + init_MUTEX(&mdev->md_io_mutex); + init_MUTEX(&mdev->data.mutex); + init_MUTEX(&mdev->meta.mutex); + sema_init(&mdev->data.work.s,0); + sema_init(&mdev->meta.work.s,0); + + mdev->al_lock = SPIN_LOCK_UNLOCKED; + mdev->tl_lock = SPIN_LOCK_UNLOCKED; + mdev->ee_lock = SPIN_LOCK_UNLOCKED; + mdev->req_lock = SPIN_LOCK_UNLOCKED; + mdev->pr_lock = SPIN_LOCK_UNLOCKED; + mdev->send_task_lock = SPIN_LOCK_UNLOCKED; + + INIT_LIST_HEAD(&mdev->free_ee); + INIT_LIST_HEAD(&mdev->active_ee); + INIT_LIST_HEAD(&mdev->sync_ee); + INIT_LIST_HEAD(&mdev->done_ee); + INIT_LIST_HEAD(&mdev->read_ee); + INIT_LIST_HEAD(&mdev->net_ee); + INIT_LIST_HEAD(&mdev->app_reads); + INIT_LIST_HEAD(&mdev->resync_reads); + INIT_LIST_HEAD(&mdev->data.work.q); + INIT_LIST_HEAD(&mdev->meta.work.q); + INIT_LIST_HEAD(&mdev->resync_work.list); + INIT_LIST_HEAD(&mdev->barrier_work.list); + INIT_LIST_HEAD(&mdev->unplug_work.list); + mdev->resync_work.cb = w_resync_inactive; + mdev->barrier_work.cb = w_try_send_barrier; + mdev->unplug_work.cb = w_send_write_hint; + init_timer(&mdev->resync_timer); + mdev->resync_timer.function = resync_timer_fn; + mdev->resync_timer.data = (unsigned long) mdev; + + init_waitqueue_head(&mdev->cstate_wait); + init_waitqueue_head(&mdev->ee_wait); + init_waitqueue_head(&mdev->al_wait); + + drbd_thread_init(mdev, &mdev->receiver, drbdd_init); + drbd_thread_init(mdev, &mdev->worker, drbd_worker); + drbd_thread_init(mdev, &mdev->asender, drbd_asender); + +NOT_IN_26( + mdev->write_hint_tq.routine = &drbd_unplug_fn; + mdev->write_hint_tq.data = mdev; +) + +#ifdef __arch_um__ + INFO("mdev = 0x%p\n",mdev); +#endif +} + +void drbd_mdev_cleanup(drbd_dev *mdev) +{ + /* I'd like to cleanup completely, and memset(,0,) it. + * but I'd have to reinit it. + * FIXME: do the right thing... + */ + + /* list of things that may still + * hold data of the previous config + + * act_log ** re-initialized in set_disk + * on_io_error + + * al_tr_cycle ** re-initialized in ... FIXME?? + * al_tr_number + * al_tr_pos + + * backing_bdev ** re-initialized in drbd_free_ll_dev + * lo_file + * md_bdev + * md_file + * md_index + + * ko_count ** re-initialized in set_net + + * last_received ** currently ignored + + * mbds_id ** re-initialized in ... FIXME?? + + * resync ** re-initialized in ... FIXME?? + + *** no re-init necessary (?) *** + * md_io_page + * this_bdev + + * vdisk ? + + * rq_queue ** FIXME ASSERT ?? 
+ * newest_barrier + * oldest_barrier + */ + + drbd_thread_stop(&mdev->worker); + + if ( mdev->ee_in_use != 0 + || mdev->ee_vacant != 32 /* EE_MININUM */ + || atomic_read(&mdev->epoch_size) != 0) + ERR("ee_in_use:%d ee_vacant:%d epoch_size:%d\n", + mdev->ee_in_use, mdev->ee_vacant, atomic_read(&mdev->epoch_size)); +#define ZAP(x) memset(&x,0,sizeof(x)) + ZAP(mdev->conf); + ZAP(mdev->sync_conf); + // ZAP(mdev->data); Not yet! + // ZAP(mdev->meta); Not yet! + ZAP(mdev->gen_cnt); +#undef ZAP + mdev->al_writ_cnt = + mdev->bm_writ_cnt = + mdev->read_cnt = + mdev->recv_cnt = + mdev->send_cnt = + mdev->writ_cnt = + mdev->la_size = + mdev->lo_usize = + mdev->p_size = + mdev->rs_start = + mdev->rs_total = + mdev->rs_mark_left = + mdev->rs_mark_time = 0; + mdev->send_task = NULL; + drbd_set_my_capacity(mdev,0); + drbd_bm_resize(mdev,0); + + // just in case + drbd_free_resources(mdev); + + /* + * currently we drbd_init_ee only on module load, so + * we may do drbd_release_ee only on module unload! + * drbd_release_ee(&mdev->free_ee); + * D_ASSERT(list_emptry(&mdev->free_ee)); + * + */ + D_ASSERT(list_empty(&mdev->active_ee)); + D_ASSERT(list_empty(&mdev->sync_ee)); + D_ASSERT(list_empty(&mdev->done_ee)); + D_ASSERT(list_empty(&mdev->read_ee)); + D_ASSERT(list_empty(&mdev->net_ee)); + D_ASSERT(list_empty(&mdev->app_reads)); + D_ASSERT(list_empty(&mdev->resync_reads)); + D_ASSERT(list_empty(&mdev->data.work.q)); + D_ASSERT(list_empty(&mdev->meta.work.q)); + D_ASSERT(list_empty(&mdev->resync_work.list)); + D_ASSERT(list_empty(&mdev->barrier_work.list)); + D_ASSERT(list_empty(&mdev->unplug_work.list)); + + drbd_set_defaults(mdev); +} + + +void drbd_destroy_mempools(void) +{ + if (drbd_request_mempool) + mempool_destroy(drbd_request_mempool); + if (drbd_ee_cache && kmem_cache_destroy(drbd_ee_cache)) + printk(KERN_ERR DEVICE_NAME + ": kmem_cache_destroy(drbd_ee_cache) FAILED\n"); + if (drbd_request_cache && kmem_cache_destroy(drbd_request_cache)) + printk(KERN_ERR DEVICE_NAME + ": kmem_cache_destroy(drbd_request_cache) FAILED\n"); + // FIXME what can we do if we fail to destroy them? 
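	/*
	 * The mempool and the two caches are torn down in the reverse order
	 * of their creation; resetting the pointers below keeps this function
	 * safe to call from the error path of drbd_create_mempools(), where
	 * only some of the three objects may exist yet.
	 */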
+ + drbd_request_mempool = NULL; + drbd_ee_cache = NULL; + drbd_request_cache = NULL; + + return; +} + +int drbd_create_mempools(void) +{ + // prepare our caches and mempools + drbd_request_mempool = NULL; + drbd_ee_cache = NULL; + drbd_request_cache = NULL; + + // caches + drbd_request_cache = kmem_cache_create( + "drbd_req_cache", sizeof(drbd_request_t), + 0, SLAB_NO_REAP, NULL, NULL); + if (drbd_request_cache == NULL) + goto Enomem; + + drbd_ee_cache = kmem_cache_create( + "drbd_ee_cache", sizeof(struct Tl_epoch_entry), + 0, SLAB_NO_REAP, NULL, NULL); + if (drbd_ee_cache == NULL) + goto Enomem; + + // mempools + drbd_request_mempool = mempool_create(16, //TODO; reasonable value + mempool_alloc_slab, mempool_free_slab, drbd_request_cache); + if (drbd_request_mempool == NULL) + goto Enomem; + + return 0; + + Enomem: + drbd_destroy_mempools(); // in case we allocated some + return -ENOMEM; +} + +static void __exit drbd_cleanup(void) +{ + int i, rr; + + if (drbd_conf) { + for (i = 0; i < minor_count; i++) { + drbd_dev *mdev = drbd_conf + i; + + if (mdev) { + down(&mdev->device_mutex); + drbd_set_state(mdev,Secondary); + up(&mdev->device_mutex); + drbd_sync_me(mdev); + set_bit(DO_NOT_INC_CONCNT,&mdev->flags); + drbd_thread_stop(&mdev->receiver); + drbd_thread_stop(&mdev->worker); + } + } + + if (drbd_proc) + remove_proc_entry("drbd",&proc_root); + i=minor_count; + while (i--) { + drbd_dev *mdev = drbd_conf+i; +ONLY_IN_26( + struct gendisk **disk = &mdev->vdisk; + request_queue_t **q = &mdev->rq_queue; +) + + drbd_free_resources(mdev); + +ONLY_IN_26( + if (*disk) { + del_gendisk(*disk); + put_disk(*disk); + *disk = NULL; + } + if (*q) blk_put_queue(*q); + *q = NULL; + + if (mdev->this_bdev->bd_holder == drbd_sec_holder) { + mdev->this_bdev->bd_contains = mdev->this_bdev; + bd_release(mdev->this_bdev); + } + if (mdev->this_bdev) bdput(mdev->this_bdev); +) + + tl_cleanup(mdev); + if (mdev->bitmap) drbd_bm_cleanup(mdev); + if (mdev->resync) lc_free(mdev->resync); + + D_ASSERT(mdev->ee_in_use==0); + + rr = drbd_release_ee(mdev,&mdev->free_ee); + // INFO("%d EEs in free list found.\n",rr); + // D_ASSERT(rr == 32); + + rr = drbd_release_ee(mdev,&mdev->active_ee); + if(rr) ERR("%d EEs in active list found!\n",rr); + + rr = drbd_release_ee(mdev,&mdev->sync_ee); + if(rr) ERR("%d EEs in sync list found!\n",rr); + + rr = drbd_release_ee(mdev,&mdev->read_ee); + if(rr) ERR("%d EEs in read list found!\n",rr); + + rr = drbd_release_ee(mdev,&mdev->done_ee); + if(rr) ERR("%d EEs in done list found!\n",rr); + + rr = drbd_release_ee(mdev,&mdev->net_ee); + if(rr) ERR("%d EEs in net list found!\n",rr); + + ERR_IF (!list_empty(&mdev->data.work.q)) { + struct list_head *lp; + list_for_each(lp,&mdev->data.work.q) { + DUMPP(lp); + } + }; + D_ASSERT(mdev->ee_vacant == 0); + + if (mdev->md_io_page) + __free_page(mdev->md_io_page); + + if (mdev->md_io_tmpp) + __free_page(mdev->md_io_tmpp); + + if (mdev->act_log) lc_free(mdev->act_log); + } + drbd_destroy_mempools(); + } + +#ifndef HAVE_COMPAT_IOCTL_MEMBER +#if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64) || defined(CONFIG_X86_64) + lock_kernel(); + unregister_ioctl32_conversion(DRBD_IOCTL_GET_VERSION); + unregister_ioctl32_conversion(DRBD_IOCTL_SET_STATE); + unregister_ioctl32_conversion(DRBD_IOCTL_SET_DISK_CONFIG); + unregister_ioctl32_conversion(DRBD_IOCTL_SET_NET_CONFIG); + unregister_ioctl32_conversion(DRBD_IOCTL_UNCONFIG_NET); + unregister_ioctl32_conversion(DRBD_IOCTL_GET_CONFIG); + unregister_ioctl32_conversion(DRBD_IOCTL_INVALIDATE); + 
unregister_ioctl32_conversion(DRBD_IOCTL_INVALIDATE_REM); + unregister_ioctl32_conversion(DRBD_IOCTL_SET_SYNC_CONFIG); + unregister_ioctl32_conversion(DRBD_IOCTL_SET_DISK_SIZE); + unregister_ioctl32_conversion(DRBD_IOCTL_WAIT_CONNECT); + unregister_ioctl32_conversion(DRBD_IOCTL_WAIT_SYNC); + unregister_ioctl32_conversion(DRBD_IOCTL_UNCONFIG_DISK); + unlock_kernel(); +#endif +#endif + +NOT_IN_26( + blksize_size[MAJOR_NR] = NULL; + blk_size[MAJOR_NR] = NULL; + // kfree(NULL) is noop + kfree(drbd_blocksizes); + kfree(drbd_sizes); +) + kfree(drbd_conf); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + devfs_unregister(devfs_handle); +#else + devfs_remove(drbd_devfs_name); +#endif + + if (unregister_blkdev(MAJOR_NR, DEVICE_NAME) != 0) + printk(KERN_ERR DEVICE_NAME": unregister of device failed\n"); + + printk(KERN_INFO DEVICE_NAME": module cleanup done.\n"); +} + +int sizeof_drbd_structs_sanity_check(void); +int __init drbd_init(void) +{ + int i,err; + +#if 0 +#warning "DEBUGGING" +/* I am too lazy to calculate this by hand -lge + */ +#define SZO(x) printk(KERN_ERR "sizeof(" #x ") = %d\n", sizeof(x)) + SZO(struct Drbd_Conf); + SZO(struct buffer_head); + SZO(Drbd_Polymorph_Packet); + SZO(struct drbd_socket); + SZO(struct bm_extent); + SZO(struct lc_element); + SZO(struct semaphore); + SZO(struct drbd_request); + SZO(struct bio); + SZO(wait_queue_head_t); + SZO(spinlock_t); + SZO(Drbd_Header); + SZO(Drbd_HandShake_Packet); + SZO(Drbd_Barrier_Packet); + SZO(Drbd_BarrierAck_Packet); + SZO(Drbd_SyncParam_Packet); + SZO(Drbd_Parameter_Packet); + SZO(Drbd06_Parameter_P); + SZO(Drbd_Data_Packet); + SZO(Drbd_BlockAck_Packet); + printk(KERN_ERR "AL_EXTENTS_PT = %d\n",AL_EXTENTS_PT); + printk(KERN_ERR "DRBD_MAX_SECTORS = %llu\n",DRBD_MAX_SECTORS); + return -EBUSY; +#endif + + if (sizeof(Drbd_HandShake_Packet) != 80) { + printk(KERN_ERR DEVICE_NAME + ": never change the size or layout of the HandShake packet.\n"); + return -EINVAL; + } + if (sizeof_drbd_structs_sanity_check()) { + return -EINVAL; + } + + if (use_nbd_major) { + major_nr = NBD_MAJOR; + } + + if (1 > minor_count||minor_count > 255) { + printk(KERN_ERR DEVICE_NAME + ": invalid minor_count (%d)\n",minor_count); +#ifdef MODULE + return -EINVAL; +#else + minor_count = 8; +#endif + } + + err = register_blkdev(MAJOR_NR, DEVICE_NAME + NOT_IN_26(, &drbd_ops) + ); + if (err) { + printk(KERN_ERR DEVICE_NAME + ": unable to register block device major %d\n", + MAJOR_NR); + return err; + } + + drbd_devfs_name = (major_nr == NBD_MAJOR) ? 
"nbd" : "drbd"; + + /* + * allocate all necessary structs + */ + err = -ENOMEM; + + drbd_proc = NULL; // play safe for drbd_cleanup + drbd_conf = kmalloc(sizeof(drbd_dev)*minor_count,GFP_KERNEL); + if (likely(drbd_conf!=NULL)) + memset(drbd_conf,0,sizeof(drbd_dev)*minor_count); + else goto Enomem; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + drbd_sizes = kmalloc(sizeof(int)*minor_count,GFP_KERNEL); + if (likely(drbd_sizes!=NULL)) + memset(drbd_sizes,0,sizeof(int)*minor_count); + else goto Enomem; + drbd_blocksizes = kmalloc(sizeof(int)*minor_count,GFP_KERNEL); + if (unlikely(!drbd_blocksizes)) goto Enomem; +#else + + devfs_mk_dir(drbd_devfs_name); + + for (i = 0; i < minor_count; i++) { + drbd_dev *mdev = drbd_conf + i; + struct gendisk *disk; + request_queue_t *q; + + q = blk_alloc_queue(GFP_KERNEL); + if (!q) goto Enomem; + mdev->rq_queue = q; + q->queuedata = mdev; + + disk = alloc_disk(1); + if (!disk) goto Enomem; + mdev->vdisk = disk; + + set_disk_ro( disk, TRUE ); + + disk->queue = q; + disk->major = MAJOR_NR; + disk->first_minor = i; + disk->fops = &drbd_ops; + sprintf(disk->disk_name, DEVICE_NAME "%d", i); + sprintf(disk->devfs_name, "%s/%d", drbd_devfs_name, i); + disk->private_data = mdev; + add_disk(disk); + + mdev->this_bdev = bdget(MKDEV(MAJOR_NR,i)); + // we have no partitions. we contain only ourselves. + mdev->this_bdev->bd_contains = mdev->this_bdev; + if (bd_claim(mdev->this_bdev,drbd_sec_holder)) { + // Initial we are Secondary -> should claim myself. + WARN("Could not bd_claim() myself."); + } else if (disable_bd_claim) { + bd_release(mdev->this_bdev); + } + + blk_queue_make_request(q,drbd_make_request_26); + q->queue_lock = &mdev->req_lock; // needed since we use + // plugging on a queue, that actually has no requests! + q->unplug_fn = drbd_unplug_fn; + } +#endif + + if ((err = drbd_create_mempools())) + goto Enomem; + + for (i = 0; i < minor_count; i++) { + drbd_dev *mdev = &drbd_conf[i]; + struct page *page = alloc_page(GFP_KERNEL); + + drbd_init_set_defaults(mdev); + +NOT_IN_26( + drbd_blocksizes[i] = INITIAL_BLOCK_SIZE; + mdev->this_bdev = MKDEV(MAJOR_NR, i); + set_device_ro( MKDEV(MAJOR_NR, i), TRUE ); +) + + if(!page) goto Enomem; + mdev->md_io_page = page; + + if (drbd_bm_init(mdev)) goto Enomem; + // no need to lock access, we are still initializing the module. 
+ mdev->resync = lc_alloc(17, sizeof(struct bm_extent),mdev); + if (!mdev->resync) goto Enomem; + mdev->act_log = lc_alloc(mdev->sync_conf.al_extents, + sizeof(struct lc_element), mdev); + if (!mdev->act_log) goto Enomem; + + init_MUTEX(&mdev->device_mutex); + if (!tl_init(mdev)) goto Enomem; + if (!drbd_init_ee(mdev)) goto Enomem; + } + +#if CONFIG_PROC_FS + /* + * register with procfs + */ + drbd_proc = create_proc_entry("drbd", S_IFREG | S_IRUGO , &proc_root); + + if (!drbd_proc) { + printk(KERN_ERR DEVICE_NAME": unable to register proc file\n"); + goto Enomem; + } + + drbd_proc->proc_fops = &drbd_proc_fops; + drbd_proc->owner = THIS_MODULE; +#else +# error "Currently drbd depends on the proc file system (CONFIG_PROC_FS)" +#endif +NOT_IN_26( + blksize_size[MAJOR_NR] = drbd_blocksizes; + blk_size[MAJOR_NR] = drbd_sizes; +) + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + devfs_handle = devfs_mk_dir (NULL, drbd_devfs_name, NULL); + devfs_register_series(devfs_handle, "%u", minor_count, + DEVFS_FL_DEFAULT, MAJOR_NR, 0, + S_IFBLK | S_IRUSR | S_IWUSR, + &drbd_ops, NULL); +#endif + + NOT_IN_26(blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR),drbd_make_request_24);) + +#ifndef HAVE_COMPAT_IOCTL_MEMBER +#if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64) || defined(CONFIG_X86_64) + // tell the kernel that we think our ioctls are 64bit clean + lock_kernel(); + register_ioctl32_conversion(DRBD_IOCTL_GET_VERSION,NULL); + register_ioctl32_conversion(DRBD_IOCTL_SET_STATE,NULL); + register_ioctl32_conversion(DRBD_IOCTL_SET_DISK_CONFIG,NULL); + register_ioctl32_conversion(DRBD_IOCTL_SET_NET_CONFIG,NULL); + register_ioctl32_conversion(DRBD_IOCTL_UNCONFIG_NET,NULL); + register_ioctl32_conversion(DRBD_IOCTL_GET_CONFIG,NULL); + register_ioctl32_conversion(DRBD_IOCTL_INVALIDATE,NULL); + register_ioctl32_conversion(DRBD_IOCTL_INVALIDATE_REM,NULL); + register_ioctl32_conversion(DRBD_IOCTL_SET_SYNC_CONFIG,NULL); + register_ioctl32_conversion(DRBD_IOCTL_SET_DISK_SIZE,NULL); + register_ioctl32_conversion(DRBD_IOCTL_WAIT_CONNECT,NULL); + register_ioctl32_conversion(DRBD_IOCTL_WAIT_SYNC,NULL); + register_ioctl32_conversion(DRBD_IOCTL_UNCONFIG_DISK,NULL); + unlock_kernel(); +#endif +#endif + + printk(KERN_INFO DEVICE_NAME ": initialised. " + "Version: " REL_VERSION " (api:%d/proto:%d)\n", + API_VERSION,PRO_VERSION); + printk(KERN_INFO DEVICE_NAME ": %s\n", drbd_buildtag()); + if (use_nbd_major) { + printk(KERN_INFO DEVICE_NAME": hijacking NBD device major!\n"); + } + printk(KERN_INFO DEVICE_NAME": registered as block device major %d\n", MAJOR_NR); + + return 0; // Success! 
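	/*
	 * Everything allocated so far, for any of the minors, is undone by
	 * the single drbd_cleanup() call behind the Enomem label; it checks
	 * the individual pointers before freeing, so bailing out of a
	 * half-initialised state is safe.
	 */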
+ + Enomem: + drbd_cleanup(); + if (err == -ENOMEM) // currently always the case + printk(KERN_ERR DEVICE_NAME ": ran out of memory\n"); + else + printk(KERN_ERR DEVICE_NAME ": initialization failure\n"); + return err; +} + +void drbd_free_ll_dev(drbd_dev *mdev) +{ + struct file *lo_file; + + lo_file = mdev->lo_file; + mdev->lo_file = 0; + wmb(); + + if (lo_file) { +NOT_IN_26( + blkdev_put(lo_file->f_dentry->d_inode->i_bdev,BDEV_FILE); + blkdev_put(mdev->md_file->f_dentry->d_inode->i_bdev,BDEV_FILE); +) +ONLY_IN_26( + bd_release(mdev->backing_bdev); + bd_release(mdev->md_bdev); +) + mdev->md_bdev = + mdev->backing_bdev = 0; + + fput(lo_file); + fput(mdev->md_file); + // mdev->lo_file = 0; + mdev->md_file = 0; + } +} + +void drbd_free_sock(drbd_dev *mdev) +{ + if (mdev->data.socket) { + sock_release(mdev->data.socket); + mdev->data.socket = 0; + } + if (mdev->meta.socket) { + sock_release(mdev->meta.socket); + mdev->meta.socket = 0; + } +} + + +void drbd_free_resources(drbd_dev *mdev) +{ + drbd_free_sock(mdev); + drbd_free_ll_dev(mdev); +} + +/*********************************/ +/* meta data management */ + +struct meta_data_on_disk { + u64 la_size; // last agreed size. + u32 gc[GEN_CNT_SIZE]; // generation counter + u32 magic; + u32 md_size; + u32 al_offset; // offset to this block + u32 al_nr_extents; // important for restoring the AL + u32 bm_offset; // offset to the bitmap, from here +} __attribute((packed)); + +/* + +FIXME md_io might fail unnoticed sometimes ... + +*/ +void drbd_md_write(drbd_dev *mdev) +{ + struct meta_data_on_disk * buffer; + u32 flags; + sector_t sector; + int i; + + ERR_IF(!inc_local_md_only(mdev)) return; + + down(&mdev->md_io_mutex); + buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + memset(buffer,0,512); + + flags = mdev->gen_cnt[Flags] & ~(MDF_PrimaryInd|MDF_ConnectedInd); + if (mdev->state == Primary) flags |= MDF_PrimaryInd; + if (mdev->cstate >= WFReportParams) flags |= MDF_ConnectedInd; + mdev->gen_cnt[Flags] = flags; + + for (i = Flags; i < GEN_CNT_SIZE; i++) + buffer->gc[i]=cpu_to_be32(mdev->gen_cnt[i]); + buffer->la_size=cpu_to_be64(drbd_get_capacity(mdev->this_bdev)>>1); + buffer->magic=cpu_to_be32(DRBD_MD_MAGIC); + + buffer->md_size = __constant_cpu_to_be32(MD_RESERVED_SIZE); + buffer->al_offset = __constant_cpu_to_be32(MD_AL_OFFSET); + buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements); + + buffer->bm_offset = __constant_cpu_to_be32(MD_BM_OFFSET); + + sector = drbd_md_ss(mdev) + MD_GC_OFFSET; + +#if 0 + /* FIXME sooner or later I'd like to use the MD_DIRTY flag everywhere, + * so we can avoid unneccessary md writes. + */ + ERR_IF (!test_bit(MD_DIRTY,&mdev->flags)) { + dump_stack(); + } +#endif + + if (drbd_md_sync_page_io(mdev,sector,WRITE)) { + clear_bit(MD_DIRTY,&mdev->flags); + } else { + if (test_bit(DISKLESS,&mdev->flags)) { + /* this was a try anyways ... */ + ERR("meta data update failed!\n"); + } else { + /* If we cannot write our meta data, + * but we are supposed to be able to, + * tough! + */ + drbd_panic("meta data update failed!\n"); + } + } + + // why is this here?? please EXPLAIN. + mdev->la_size = drbd_get_capacity(mdev->this_bdev)>>1; + + up(&mdev->md_io_mutex); + dec_local(mdev); +} + +/* + * return: + * < 0 if we had an error (currently never ...) + * = 0 if we need a FullSync because either the flag is set, + * or the gen counts are invalid + * > 0 if we could read valid gen counts, + * and reading the bitmap and act log does make sense. 
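 *
 * a typical caller therefore looks roughly like this (sketch):
 *
 *	rv = drbd_md_read(mdev);
 *	if (rv > 0)  ->  load activity log and bitmap from disk
 *	if (rv == 0) ->  skip them, a full sync is pending anyway
 *	(rv < 0 is reserved for real errors and currently never happens)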
+ */ +int drbd_md_read(drbd_dev *mdev) +{ + struct meta_data_on_disk * buffer; + sector_t sector; + int i; + + if(!inc_local_md_only(mdev)) return -1; + + down(&mdev->md_io_mutex); + buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + + sector = drbd_md_ss(mdev) + MD_GC_OFFSET; + +/* FIXME different failure cases: IO error or invalid magic */ + + ERR_IF( ! drbd_md_sync_page_io(mdev,sector,READ) ) goto err; + + if(be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) goto err; + + for(i=Flags;i<=ArbitraryCnt;i++) + mdev->gen_cnt[i]=be32_to_cpu(buffer->gc[i]); + mdev->la_size = be64_to_cpu(buffer->la_size); + mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); + if (mdev->sync_conf.al_extents < 7) + mdev->sync_conf.al_extents = 127; + + up(&mdev->md_io_mutex); + dec_local(mdev); + + return !drbd_md_test_flag(mdev,MDF_FullSync); + + err: + up(&mdev->md_io_mutex); + dec_local(mdev); + + INFO("Creating state block\n"); + + /* if we need to create a state block, we are + * not consistent, and need a sync of the full device! + * if one knows what he is doing, he can manipulate gcs by hand, + * and avoid the initial full sync... + * otherwise, one of us will have to be forced (--do-what-I-say) + * to be primary, before anything is usable. + */ + set_bit(MD_DIRTY,&mdev->flags); + mdev->gen_cnt[Flags] = MDF_FullSync; + for(i = HumanCnt; i < GEN_CNT_SIZE; i++) mdev->gen_cnt[i]=1; + +/* FIXME might have IO errors! */ + drbd_md_write(mdev); + + return 0; +} + +#if DUMP_MD >= 1 +#define MeGC(x) mdev->gen_cnt[x] +#define PeGC(x) be32_to_cpu(peer->gen_cnt[x]) + +void drbd_dump_md(drbd_dev *mdev, Drbd_Parameter_Packet *peer, int verbose) +{ + INFO("I am(%c): %c:%08x:%08x:%08x:%08x:%c%c\n", + mdev->state == Primary ? 'P':'S', + MeGC(Flags) & MDF_Consistent ? '1' : '0', + MeGC(HumanCnt), + MeGC(TimeoutCnt), + MeGC(ConnectedCnt), + MeGC(ArbitraryCnt), + MeGC(Flags) & MDF_PrimaryInd ? '1' : '0', + MeGC(Flags) & MDF_ConnectedInd ? '1' : '0'); + if (peer) { + INFO("Peer(%c): %c:%08x:%08x:%08x:%08x:%c%c\n", + be32_to_cpu(peer->state) == Primary ? 'P':'S', + PeGC(Flags) & MDF_Consistent ? '1' : '0', + PeGC(HumanCnt), + PeGC(TimeoutCnt), + PeGC(ConnectedCnt), + PeGC(ArbitraryCnt), + PeGC(Flags) & MDF_PrimaryInd ? '1' : '0', + PeGC(Flags) & MDF_ConnectedInd ? '1' : '0'); + } else { + INFO("Peer Unknown.\n"); + } + if (verbose) { + /* TODO + * dump activity log and bitmap summary, + * and maybe other statistics + */ + } +} + +#undef MeGC +#undef PeGC +#else +void drbd_dump_md(drbd_dev *mdev, Drbd_Parameter_Packet *peer, int verbose) +{ /* do nothing */ } +#endif + +// Returns 1 if I have the good bits, +// 0 if both are nice +// -1 if the partner has the good bits. +int drbd_md_compare(drbd_dev *mdev,Drbd_Parameter_Packet *partner) +{ + int i; + u32 me,other; + + /* FIXME + * we should not only rely on the consistent bit, but at least check + * whether the rest of the gencounts is plausible, to detect a previous + * split brain situation, and refuse anything until we are told + * otherwise! + * + * And we should refuse to become SyncSource if we are not consistent! + * + * though DRBD is not to blame for it, + * someone eventually will try to blame it ... 
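 *
 * What the code below does today, as a worked example (the first
 * difference decides):
 *
 *	1. MDF_Consistent	a consistent node beats an inconsistent one
 *	2. HumanCnt..ArbitraryCnt	the higher generation count wins
 *	3. MDF_PrimaryInd	"was Primary last" breaks a remaining tie
 *
 * e.g. both nodes consistent, my HumanCnt is 3 and the peer's is 2: the
 * flags compare equal, 3 > 2 decides, and this node is the one with the
 * good (up to date) data.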
+ */ + + me=mdev->gen_cnt[Flags] & MDF_Consistent; + other=be32_to_cpu(partner->gen_cnt[Flags]) & MDF_Consistent; + if( me > other ) return 1; + if( me < other ) return -1; + + for(i=HumanCnt;i<=ArbitraryCnt;i++) { + me=mdev->gen_cnt[i]; + other=be32_to_cpu(partner->gen_cnt[i]); + if( me > other ) return 1; + if( me < other ) return -1; + } + + me=mdev->gen_cnt[Flags] & MDF_PrimaryInd; + other=be32_to_cpu(partner->gen_cnt[Flags]) & MDF_PrimaryInd; + if( me > other ) return 1; + if( me < other ) return -1; + + return 0; +} + +/* THINK do these have to be protected by some lock ? */ +void drbd_md_inc(drbd_dev *mdev, enum MetaDataIndex order) +{ + set_bit(MD_DIRTY,&mdev->flags); + mdev->gen_cnt[order]++; +} +void drbd_md_set_flag(drbd_dev *mdev, int flag) +{ + if ( (mdev->gen_cnt[Flags] & flag) != flag) { + set_bit(MD_DIRTY,&mdev->flags); + mdev->gen_cnt[Flags] |= flag; + } +} +void drbd_md_clear_flag(drbd_dev *mdev, int flag) +{ + if ( (mdev->gen_cnt[Flags] & flag) != 0 ) { + set_bit(MD_DIRTY,&mdev->flags); + mdev->gen_cnt[Flags] &= ~flag; + } +} +int drbd_md_test_flag(drbd_dev *mdev, int flag) +{ + return ((mdev->gen_cnt[Flags] & flag) != 0); +} + +module_init(drbd_init) +module_exit(drbd_cleanup) --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./drivers/block/drbd/drbd_proc.c 2005-09-01 11:40:30.000000000 +0400 @@ -0,0 +1,294 @@ +/* +-*- linux-c -*- + drbd_proc.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 1999-2004, Philipp Reisner . + main author. + + Copyright (C) 2002-2004, Lars Ellenberg . + main contributor. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "drbd_int.h" + +STATIC int drbd_proc_open(struct inode *inode, struct file *file); +STATIC int drbd_seq_show(struct seq_file *seq, void *v); + + +struct proc_dir_entry *drbd_proc; +struct file_operations drbd_proc_fops = { + .owner = THIS_MODULE, + .open = drbd_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +// We ommit single_open and single_release, since that is only available +// after 2.4.23 +static void *single_start(struct seq_file *p, loff_t *pos) +{ + return NULL + (*pos == 0); +} + +static void *single_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + return NULL; +} + +static void single_stop(struct seq_file *p, void *v) +{ +} + +struct seq_operations drbd_proc_seq_ops = { + .start = single_start, + .next = single_next, + .stop = single_stop, + .show = drbd_seq_show, +}; + +/*lge + * progress bars shamelessly adapted from driver/md/md.c + * output looks like + * [=====>..............] 
33.5% (23456/123456) + * finish: 2:20:20 speed: 6,345 (6,456) K/sec + */ +STATIC void drbd_syncer_progress(struct Drbd_Conf* mdev, struct seq_file *seq) +{ + unsigned long res , db, dt, dbdt, rt, rs_left; + + /* the whole sector_div thingy was wrong (did overflow, + * did not use correctly typed parameters), and is not even + * neccessary as long as rs_total and drbd_bm_total_weight + * are both unsigned long. + * + * this is to break it at compile time when we change that + * (we may feel 4TB maximum storage per drbd is not enough) + */ + typecheck(unsigned long, mdev->rs_total); + + /* note: both rs_total and rs_left are in bits, i.e. in + * units of BM_BLOCK_SIZE. + * for the percentage, we don't care. */ + + rs_left = drbd_bm_total_weight(mdev); + /* >> 10 to prevent overflow, + * +1 to prevent division by zero */ + if (rs_left > mdev->rs_total) { + /* doh. logic bug somewhere. + * for now, just try to prevent in-kernel buffer overflow. + */ + ERR("logic bug? rs_left=%lu > rs_total=%lu\n", + rs_left, mdev->rs_total); + res = 1000; + } else { + res = (rs_left >> 10)*1000/((mdev->rs_total >> 10) + 1); + } + { + int i, y = res/50, x = 20-y; + seq_printf(seq, "\t["); + for (i = 1; i < x; i++) + seq_printf(seq, "="); + seq_printf(seq, ">"); + for (i = 0; i < y; i++) + seq_printf(seq, "."); + seq_printf(seq, "] "); + } + res = 1000L - res; + seq_printf(seq,"sync'ed:%3lu.%lu%% ", res / 10, res % 10); + /* if more than 1 GB display in MB */ + if (mdev->rs_total > 0x100000L) { + seq_printf(seq,"(%lu/%lu)M\n\t", + (unsigned long) Bit2KB(rs_left) >> 10, + (unsigned long) Bit2KB(mdev->rs_total) >> 10 ); + } else { + seq_printf(seq,"(%lu/%lu)K\n\t", + (unsigned long) Bit2KB(rs_left), + (unsigned long) Bit2KB(mdev->rs_total) ); + } + + /* see drivers/md/md.c + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + dt = (jiffies - mdev->rs_mark_time) / HZ; + + if (dt > 20) { + /* if we made no update to rs_mark_time for too long, + * we are stalled. show that. 
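 * (otherwise the estimate printed below is essentially
 *	remaining time = dt * rs_left / db
 * just reordered and scaled by the *100/100 trick to avoid overflow;
 * e.g. 30 s since the last mark, 20000 bits cleared in that time and
 * 1000000 bits still left gives roughly 1500 s to go)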
*/ + seq_printf(seq, "stalled\n"); + return; + } + + if (!dt) dt++; + db = mdev->rs_mark_left - rs_left; + rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ + + seq_printf(seq, "finish: %lu:%02lu:%02lu", + rt / 3600, (rt % 3600) / 60, rt % 60); + + /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */ + dbdt = Bit2KB(db/dt); + if (dbdt > 1000) + seq_printf(seq, " speed: %ld,%03ld", + dbdt/1000,dbdt % 1000); + else + seq_printf(seq, " speed: %ld", dbdt); + + /* mean speed since syncer started + * we do account for PausedSync periods */ + dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; + if (dt <= 0) dt=1; + db = mdev->rs_total - rs_left; + dbdt = Bit2KB(db/dt); + if (dbdt > 1000) + seq_printf(seq, " (%ld,%03ld)", + dbdt/1000,dbdt % 1000); + else + seq_printf(seq, " (%ld)", dbdt); + + seq_printf(seq," K/sec\n"); +} + +const char* cstate_to_name(Drbd_CState s) { + static const char *cstate_names[] = { + [Unconfigured] = "Unconfigured", + [StandAlone] = "StandAlone", + [Unconnected] = "Unconnected", + [Timeout] = "Timeout", + [BrokenPipe] = "BrokenPipe", + [NetworkFailure] = "NetworkFailure", + [WFConnection] = "WFConnection", + [WFReportParams] = "WFReportParams", + [Connected] = "Connected", + [SkippedSyncS] = "SkippedSyncS", + [SkippedSyncT] = "SkippedSyncT", + [WFBitMapS] = "WFBitMapS", + [WFBitMapT] = "WFBitMapT", + [SyncSource] = "SyncSource", + [SyncTarget] = "SyncTarget", + [PausedSyncS] = "PausedSyncS", + [PausedSyncT] = "PausedSyncT", + }; + + return s < Unconfigured ? "TO_SMALL" : + s > PausedSyncT ? "TO_LARGE" + : cstate_names[s]; +} + +const char* nodestate_to_name(Drbd_State s) { + static const char *state_names[] = { + [Primary] = "Primary", + [Secondary] = "Secondary", + [Unknown] = "Unknown" + }; + + return s < Unknown ? "TO_SMALL" : + s > Secondary ? "TO_LARGE" + : state_names[s]; +} + + +STATIC int drbd_seq_show(struct seq_file *seq, void *v) +{ + int i; + const char *sn; + + seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d)\n%s\n", + API_VERSION,PRO_VERSION, drbd_buildtag()); + + /* + cs .. connection state + st .. node state (local/remote) + ld .. local data consistentency + ns .. network send + nr .. network receive + dw .. disk write + dr .. disk read + pe .. pending (waiting for ack) + ua .. unack'd (still need to send ack) + al .. access log write count + */ + + for (i = 0; i < minor_count; i++) { + sn = cstate_to_name(drbd_conf[i].cstate); + if(drbd_conf[i].cstate == Connected) { + if(test_bit(DISKLESS,&drbd_conf[i].flags)) + sn = "DiskLessClient"; + if(test_bit(PARTNER_DISKLESS,&drbd_conf[i].flags)) + sn = "ServerForDLess"; + } + if ( drbd_conf[i].cstate == Unconfigured ) + seq_printf( seq, "%2d: cs:Unconfigured\n", i); + else + seq_printf( seq, + "%2d: cs:%s st:%s/%s ld:%s\n" + " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " + "lo:%d pe:%d ua:%d ap:%d\n", + i, sn, + nodestate_to_name(drbd_conf[i].state), + nodestate_to_name(drbd_conf[i].o_state), + (drbd_conf[i].gen_cnt[Flags] + & MDF_Consistent) ? "Consistent" : "Inconsistent", + // FIXME partner consistent? 
+ drbd_conf[i].send_cnt/2, + drbd_conf[i].recv_cnt/2, + drbd_conf[i].writ_cnt/2, + drbd_conf[i].read_cnt/2, + drbd_conf[i].al_writ_cnt, + drbd_conf[i].bm_writ_cnt, + atomic_read(&drbd_conf[i].local_cnt), + atomic_read(&drbd_conf[i].ap_pending_cnt) + + atomic_read(&drbd_conf[i].rs_pending_cnt), + atomic_read(&drbd_conf[i].unacked_cnt), + atomic_read(&drbd_conf[i].ap_bio_cnt) + ); + + if ( drbd_conf[i].cstate == SyncSource || + drbd_conf[i].cstate == SyncTarget ) + drbd_syncer_progress(drbd_conf+i,seq); + } + + return 0; +} + +STATIC int drbd_proc_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &drbd_proc_seq_ops); +} + +/* PROC FS stuff end */ --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./drivers/block/drbd/drbd_receiver.c 2006-02-09 15:39:21.000000000 +0300 @@ -0,0 +1,2380 @@ +/* +-*- linux-c -*- + drbd_receiver.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 1999-2004, Philipp Reisner . + main author. + + Copyright (C) 2002-2004, Lars Ellenberg . + main contributor. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) || defined(HAVE_MM_INLINE_H) +#include +#endif +#include +#include +#include +#define __KERNEL_SYSCALLS__ +#include +#include +#include +#include "drbd_int.h" + +#define EE_MININUM 32 // @4k pages => 128 KByte + +#define is_syncer_blk(A,B) ((B)==ID_SYNCER) + +#ifdef __arch_um__ +void *to_virt(unsigned long phys) +{ + return((void *) uml_physmem + phys); +} +#endif + +#ifdef DBG_ASSERTS +void drbd_assert_breakpoint(drbd_dev *mdev, char *exp, + char *file, int line) +{ + ERR("ASSERT( %s ) in %s:%d\n", exp, file, line); +} +#endif + + +#if 0 +#define CHECK_LIST_LIMIT 1000 +void check_list(drbd_dev *mdev,struct list_head *list,char *t) +{ + struct list_head *le,*la; + int forward=0,backward=0; + + le=list; + do { + la=le; + le=le->next; + if( le->prev != la ) { + printk(KERN_ERR DEVICE_NAME + "%d: %s list fucked.\n", + (int)(mdev-drbd_conf),t); + break; + } + if( forward++ > CHECK_LIST_LIMIT ) { + printk(KERN_ERR DEVICE_NAME + "%d: %s forward > 1000\n", + (int)(mdev-drbd_conf),t); + break; + } + } while(le != list); + + le=list; + do { + la=le; + le=le->prev; + if( le->next != la ) { + printk(KERN_ERR DEVICE_NAME + "%d: %s list fucked.\n", + (int)(mdev-drbd_conf),t); + break; + } + if( backward++ > CHECK_LIST_LIMIT ) { + printk(KERN_ERR DEVICE_NAME + "%d: %s backward > 1000\n", + (int)(mdev-drbd_conf),t); + break; + } + } while(le != list); + + if(forward != backward) { + printk(KERN_ERR DEVICE_NAME "%d: forward=%d, backward=%d\n", + (int)(mdev-drbd_conf),forward,backward); + } +} +#endif + +#if 0 +STATIC inline int is_syncer_blk(drbd_dev *mdev, u64 block_id) +{ + if ( block_id == ID_SYNCER ) return 1; + /* Use 
this code if you are working with a VIA based mboard :) */ + if ( (long)block_id == (long)-1) { + printk(KERN_ERR DEVICE_NAME + "%d: strange block_id %lx%lx\n",(int)(mdev-drbd_conf), + (unsigned long)(block_id>>32), + (unsigned long)block_id); + return 1; + } + return 0; +} +#endif //PARANOIA + +/* +You need to hold the ee_lock: + drbd_free_ee() + drbd_get_ee() + drbd_put_ee() + _drbd_process_ee() + +You must not have the ee_lock: + _drbd_alloc_ee() + drbd_alloc_ee() + drbd_init_ee() + drbd_release_ee() + drbd_ee_fix_bhs() + drbd_process_ee() + drbd_clear_done_ee() + drbd_wait_ee() +*/ + +STATIC int _drbd_alloc_ee(drbd_dev *mdev,struct page* page,int mask) +{ + struct Tl_epoch_entry* e; + + e = kmem_cache_alloc(drbd_ee_cache, mask); + if( e == NULL ) return FALSE; + + drbd_ee_init(e,page); + spin_lock_irq(&mdev->ee_lock); + list_add(&e->w.list,&mdev->free_ee); + mdev->ee_vacant++; + spin_unlock_irq(&mdev->ee_lock); + + return TRUE; +} + +/* bool */ +STATIC int drbd_alloc_ee(drbd_dev *mdev,int mask) +{ + struct page *page; + + page=alloc_page(mask); + if(!page) return FALSE; + + if(!_drbd_alloc_ee(mdev,page,GFP_KERNEL)) { + __free_page(page); + return FALSE; + } + + return TRUE; +} + +STATIC struct page* drbd_free_ee(drbd_dev *mdev, struct list_head *list) +{ + struct list_head *le; + struct Tl_epoch_entry* e; + struct page* page; + + MUST_HOLD(&mdev->ee_lock); + + D_ASSERT(!list_empty(list)); + le = list->next; + e = list_entry(le, struct Tl_epoch_entry, w.list); + list_del(le); + + page = drbd_bio_get_page(&e->private_bio); +ONLY_IN_26( + D_ASSERT(page == e->ee_bvec.bv_page); + page = e->ee_bvec.bv_page; +) + kmem_cache_free(drbd_ee_cache, e); + mdev->ee_vacant--; + + return page; +} + +int drbd_init_ee(drbd_dev *mdev) +{ + while(mdev->ee_vacant < EE_MININUM ) { + if(!drbd_alloc_ee(mdev,GFP_USER)) { + ERR("Failed to allocate %d EEs !\n",EE_MININUM); + return 0; + } + } + return 1; +} + +int drbd_release_ee(drbd_dev *mdev,struct list_head* list) +{ + int count=0; + + spin_lock_irq(&mdev->ee_lock); + while(!list_empty(list)) { + __free_page(drbd_free_ee(mdev,list)); + count++; + } + spin_unlock_irq(&mdev->ee_lock); + + return count; +} + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0) +#define GFP_TRY ( __GFP_HIGHMEM | __GFP_NOWARN ) +#else +#define GFP_TRY ( __GFP_HIGHMEM ) +#endif + +STATIC int _drbd_process_ee(drbd_dev *mdev, int be_sleepy); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +STATIC void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue(q, wait); + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} + +STATIC void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + + spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + spin_unlock_irqrestore(&q->lock, flags); +} + +#define DEFINE_WAIT(name) \ + wait_queue_t name = { \ + .task = current, \ + .task_list = { .next = &name.task_list, \ + .prev = &name.task_list, \ + }, \ + } + +#endif + +/** + * drbd_get_ee: Returns an Tl_epoch_entry; might sleep. Fails only if + * a signal comes in. 
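 *
 * The pool it draws from stays between EE_MININUM entries and
 * conf.max_buffers: if free_ee is empty the function first tries to recycle
 * finished entries via _drbd_process_ee(), then to allocate a fresh page as
 * long as the max_buffers limit permits, and only then sleeps on ee_wait
 * until drbd_put_ee() frees an entry.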
+ */ +struct Tl_epoch_entry* drbd_get_ee(drbd_dev *mdev) +{ + struct list_head *le; + struct Tl_epoch_entry* e; + DEFINE_WAIT(wait); + + MUST_HOLD(&mdev->ee_lock); + + if(mdev->ee_vacant == EE_MININUM / 2) { + spin_unlock_irq(&mdev->ee_lock); + drbd_kick_lo(mdev); + spin_lock_irq(&mdev->ee_lock); + } + + if(list_empty(&mdev->free_ee)) _drbd_process_ee(mdev,1); + + if(list_empty(&mdev->free_ee)) { + for (;;) { + prepare_to_wait(&mdev->ee_wait, &wait, + TASK_INTERRUPTIBLE); + if(!list_empty(&mdev->free_ee)) break; + spin_unlock_irq(&mdev->ee_lock); + if( ( mdev->ee_vacant+mdev->ee_in_use) < + mdev->conf.max_buffers ) { + if(drbd_alloc_ee(mdev,GFP_TRY)) { + spin_lock_irq(&mdev->ee_lock); + break; + } + } + drbd_kick_lo(mdev); + schedule(); + spin_lock_irq(&mdev->ee_lock); + finish_wait(&mdev->ee_wait, &wait); + if (signal_pending(current)) { + WARN("drbd_get_ee interrupted!\n"); + return 0; + } + // finish wait is inside, so that we are TASK_RUNNING + // in _drbd_process_ee (which might sleep by itself.) + _drbd_process_ee(mdev,1); + } + finish_wait(&mdev->ee_wait, &wait); + } + + le=mdev->free_ee.next; + list_del(le); + mdev->ee_vacant--; + mdev->ee_in_use++; + e=list_entry(le, struct Tl_epoch_entry, w.list); +ONLY_IN_26( + D_ASSERT(e->private_bio.bi_idx == 0); + drbd_ee_init(e,e->ee_bvec.bv_page); // reinitialize +) + e->block_id = !ID_VACANT; + SET_MAGIC(e); + return e; +} + +void drbd_put_ee(drbd_dev *mdev,struct Tl_epoch_entry *e) +{ + struct page* page; + + MUST_HOLD(&mdev->ee_lock); + + D_ASSERT(page_count(drbd_bio_get_page(&e->private_bio)) == 1); + + mdev->ee_in_use--; + mdev->ee_vacant++; + e->block_id = ID_VACANT; + INVALIDATE_MAGIC(e); + list_add_tail(&e->w.list,&mdev->free_ee); + + if((mdev->ee_vacant * 2 > mdev->ee_in_use ) && + ( mdev->ee_vacant + mdev->ee_in_use > EE_MININUM) ) { + // FIXME cleanup: never returns NULL anymore + page=drbd_free_ee(mdev,&mdev->free_ee); + if( page ) __free_page(page); + } + if(mdev->ee_in_use == 0) { + while( mdev->ee_vacant > EE_MININUM ) { + __free_page(drbd_free_ee(mdev,&mdev->free_ee)); + } + } + + wake_up(&mdev->ee_wait); +} + +STATIC void reclaim_net_ee(drbd_dev *mdev) +{ + struct Tl_epoch_entry *e; + struct list_head *le,*tle; + + /* The EEs are always appended to the end of the list, since + they are sent in order over the wire, they have to finish + in order. As soon as we see the first not finished we can + stop to examine the list... */ + + list_for_each_safe(le, tle, &mdev->net_ee) { + e = list_entry(le, struct Tl_epoch_entry, w.list); + if( page_count(drbd_bio_get_page(&e->private_bio)) > 1 ) break; + list_del(le); + drbd_put_ee(mdev,e); + } +} + + +/* It is important that the head list is really empty when returning, + from this function. Note, this function is called from all three + threads (receiver, worker and asender). 
To ensure this I only allow + one thread at a time in the body of the function */ +STATIC int _drbd_process_ee(drbd_dev *mdev, int be_sleepy) +{ + struct Tl_epoch_entry *e; + struct list_head *head = &mdev->done_ee; + struct list_head *le; + int ok=1; + int got_sig; + + MUST_HOLD(&mdev->ee_lock); + + reclaim_net_ee(mdev); + + if( test_and_set_bit(PROCESS_EE_RUNNING,&mdev->flags) ) { + if(!be_sleepy) { + return 3; + } + spin_unlock_irq(&mdev->ee_lock); + got_sig = wait_event_interruptible(mdev->ee_wait, + test_and_set_bit(PROCESS_EE_RUNNING,&mdev->flags) == 0); + spin_lock_irq(&mdev->ee_lock); + if(got_sig) return 2; + } + + while(!list_empty(head)) { + le = head->next; + list_del(le); + spin_unlock_irq(&mdev->ee_lock); + e = list_entry(le, struct Tl_epoch_entry, w.list); + ok = ok && e->w.cb(mdev,&e->w,0); + spin_lock_irq(&mdev->ee_lock); + drbd_put_ee(mdev,e); + } + + clear_bit(PROCESS_EE_RUNNING,&mdev->flags); + wake_up(&mdev->ee_wait); + + return ok; +} + +STATIC int drbd_process_ee(drbd_dev *mdev, int be_sleepy) +{ + int rv; + spin_lock_irq(&mdev->ee_lock); + rv=_drbd_process_ee(mdev,be_sleepy); + spin_unlock_irq(&mdev->ee_lock); + return rv; +} + +STATIC void drbd_clear_done_ee(drbd_dev *mdev) +{ + struct list_head *le; + struct Tl_epoch_entry *e; + int n = 0; + + spin_lock_irq(&mdev->ee_lock); + + reclaim_net_ee(mdev); + + while(!list_empty(&mdev->done_ee)) { + le = mdev->done_ee.next; + list_del(le); + e = list_entry(le, struct Tl_epoch_entry, w.list); + if(mdev->conf.wire_protocol == DRBD_PROT_C || + is_syncer_blk(mdev,e->block_id)) { + ++n; + } + drbd_put_ee(mdev,e); + } + + spin_unlock_irq(&mdev->ee_lock); + + sub_unacked(mdev, n); +} + + +static inline int _wait_ee_cond(struct Drbd_Conf* mdev,struct list_head *head) +{ + int rv; + spin_lock_irq(&mdev->ee_lock); + rv = list_empty(head); + spin_unlock_irq(&mdev->ee_lock); + if(!rv) drbd_kick_lo(mdev); + return rv; +} + +void drbd_wait_ee(drbd_dev *mdev,struct list_head *head) +{ + wait_event(mdev->ee_wait,_wait_ee_cond(mdev,head)); +} + +STATIC struct socket* drbd_accept(drbd_dev *mdev,struct socket* sock) +{ + struct socket *newsock; + int err = 0; + + err = sock->ops->listen(sock, 5); + if (err) + goto out; + + if (sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &newsock)) + goto out; + + newsock->type = sock->type; + newsock->ops = sock->ops; + + err = newsock->ops->accept(sock, newsock, 0); + if (err < 0) + goto out_release; + + return newsock; + + out_release: + sock_release(newsock); + out: + if(err != -EAGAIN && err != -EINTR) + ERR("accept failed! 
%d\n", err); + return 0; +} + +STATIC int drbd_recv_short(drbd_dev *mdev, void *buf, size_t size) +{ + mm_segment_t oldfs; + struct iovec iov; + struct msghdr msg; + int rv; + + if (unlikely(drbd_did_panic == DRBD_MAGIC)) { + drbd_suicide(); + } + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iovlen = 1; + msg.msg_iov = &iov; + iov.iov_len = size; + iov.iov_base = buf; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_flags = MSG_WAITALL | MSG_NOSIGNAL; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + + rv = sock_recvmsg(mdev->meta.socket, &msg, size, msg.msg_flags); + + set_fs(oldfs); + + return rv; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +# define SK_(x) x +#else +# define SK_(x) sk_ ## x +#endif + +int drbd_recv(drbd_dev *mdev,void *buf, size_t size) +{ + mm_segment_t oldfs; + struct iovec iov; + struct msghdr msg; + int rv; + + if (unlikely(drbd_did_panic == DRBD_MAGIC)) { + drbd_suicide(); + } + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iovlen = 1; + msg.msg_iov = &iov; + iov.iov_len = size; + iov.iov_base = buf; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_flags = MSG_WAITALL | MSG_NOSIGNAL; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + + for(;;) { + rv = sock_recvmsg(mdev->data.socket,&msg,size,msg.msg_flags); + if (rv == size) break; + + /* Note: + * ECONNRESET other side closed the connection + * ERESTARTSYS (on sock) we got a signal + */ + + if (rv < 0) { + if (rv == -ECONNRESET) + INFO("sock was reset by peer\n"); + else if (rv != -ERESTARTSYS) + ERR("sock_recvmsg returned %d\n",rv); + break; + } else if (rv == 0) { + INFO("sock was shut down by peer\n"); + break; + } else { + /* signal came in, or peer/link went down, + * after we read a partial message + */ + // D_ASSERT(signal_pending(current)); + break; + } + }; + + set_fs(oldfs); + + if(rv != size) { + set_cstate(mdev,BrokenPipe); + drbd_thread_restart_nowait(&mdev->receiver); + } + + return rv; +} + +STATIC struct socket *drbd_try_connect(drbd_dev *mdev) +{ + int err; + struct socket *sock; + struct sockaddr_in src_in; + + err = sock_create(AF_INET, SOCK_STREAM, 0, &sock); + if (err) { + ERR("sock_creat(..)=%d\n", err); + return NULL; + } + + sock->sk->SK_(rcvtimeo) = + sock->sk->SK_(sndtimeo) = mdev->conf.try_connect_int*HZ; + + /* explicitly bind to the configured IP as source IP + for the outgoing connections. + This is needed for multihomed hosts and to be + able to use lo: interfaces for drbd. + Make sure to use 0 as portnumber, so linux selects + a free one dynamically. 
+ */ + memcpy (&src_in, &(mdev->conf.my_addr), sizeof(struct sockaddr_in)); + src_in.sin_port = 0; + + err = sock->ops->bind(sock, + (struct sockaddr * ) &src_in, + sizeof (struct sockaddr_in)); + if (err) { + ERR("Unable to bind source sock (%d)\n", err); + sock_release(sock); + sock = NULL; + return sock; + } + + err = sock->ops->connect(sock, + (struct sockaddr *) mdev->conf.other_addr, + mdev->conf.other_addr_len, 0); + + if (err) { + sock_release(sock); + sock = NULL; + } + return sock; +} + +STATIC struct socket *drbd_wait_for_connect(drbd_dev *mdev) +{ + int err; + struct socket *sock,*sock2; + + err = sock_create(AF_INET, SOCK_STREAM, 0, &sock2); + if (err) { + ERR("sock_creat(..)=%d\n", err); + return NULL; + } + + sock2->sk->SK_(reuse) = 1; /* SO_REUSEADDR */ + sock2->sk->SK_(rcvtimeo) = + sock2->sk->SK_(sndtimeo) = mdev->conf.try_connect_int*HZ; + + err = sock2->ops->bind(sock2, + (struct sockaddr *) mdev->conf.my_addr, + mdev->conf.my_addr_len); + if (err) { + ERR("Unable to bind sock2 (%d)\n", err); + sock_release(sock2); + set_cstate(mdev,Unconnected); + return 0; + } + + sock = drbd_accept(mdev,sock2); + sock_release(sock2); + + return sock; +} + +STATIC int drbd_do_handshake(drbd_dev *mdev); + +/* + * return values: + * 1 yess, we have a valid connection + * 0 oops, did not work out, please try again + * -1 peer talks different language, + * no point in trying again, please go standalone. + */ +int drbd_connect(drbd_dev *mdev) +{ + struct socket *sock,*msock; + int h; + + D_ASSERT(mdev->cstate!=Unconfigured); + D_ASSERT(!mdev->data.socket); + + set_cstate(mdev,WFConnection); + + while(1) { + sock=drbd_try_connect(mdev); + if(sock) { + msock=drbd_wait_for_connect(mdev); + if(msock) break; + else sock_release(sock); + } else { + sock=drbd_wait_for_connect(mdev); + if(sock) { + int retry; + for (retry=1; retry <= 10; retry++) { + // give the other side time to call + // bind() & listen() + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ / 10); + msock=drbd_try_connect(mdev); + if(msock) goto connected; + ERR("msock try_connect %d\n",retry); + } + sock_release(sock); + } + } + if(mdev->cstate==Unconnected) return -1; + if(signal_pending(current)) { + drbd_flush_signals(current); + smp_rmb(); + if (get_t_state(&mdev->receiver) == Exiting) + return -1; + } + } + + connected: + + msock->sk->SK_(reuse)=1; /* SO_REUSEADDR */ + sock->sk->SK_(reuse)=1; /* SO_REUSEADDR */ + + /* to prevent oom deadlock... */ + /* The default allocation priority was GFP_KERNEL */ + sock->sk->SK_(allocation) = GFP_DRBD; + msock->sk->SK_(allocation) = GFP_DRBD; + + sock->sk->SK_(priority)=TC_PRIO_BULK; + NOT_IN_26(sock->sk->tp_pinfo.af_tcp.nonagle=0;) + ONLY_IN_26( tcp_sk(sock->sk)->nonagle = 0;) + // FIXME fold to limits. should be done in drbd_ioctl + sock->sk->SK_(sndbuf) = mdev->conf.sndbuf_size; + sock->sk->SK_(rcvbuf) = mdev->conf.sndbuf_size; + /* NOT YET ... 
+ * sock->sk->SK_(sndtimeo) = mdev->conf.timeout*HZ/20; + * sock->sk->SK_(rcvtimeo) = MAX_SCHEDULE_TIMEOUT; + * THINK HandShake timeout, hardcoded for now: */ + sock->sk->SK_(sndtimeo) = + sock->sk->SK_(rcvtimeo) = 2*HZ; + sock->sk->SK_(userlocks) |= SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK; + + msock->sk->SK_(priority)=TC_PRIO_INTERACTIVE; + NOT_IN_26(sock->sk->tp_pinfo.af_tcp.nonagle=1;) + ONLY_IN_26(tcp_sk(sock->sk)->nonagle = 1;) + msock->sk->SK_(sndbuf) = 2*32767; + msock->sk->SK_(sndtimeo) = mdev->conf.timeout*HZ/20; + msock->sk->SK_(rcvtimeo) = mdev->conf.ping_int*HZ; + + mdev->data.socket = sock; + mdev->meta.socket = msock; + mdev->last_received = jiffies; + + set_cstate(mdev,WFReportParams); + D_ASSERT(mdev->asender.task == NULL); + + h = drbd_do_handshake(mdev); + if (h <= 0) return h; + + clear_bit(ON_PRI_INC_HUMAN,&mdev->flags); + clear_bit(ON_PRI_INC_TIMEOUTEX,&mdev->flags); + + sock->sk->SK_(sndtimeo) = mdev->conf.timeout*HZ/20; + sock->sk->SK_(rcvtimeo) = MAX_SCHEDULE_TIMEOUT; + + drbd_thread_start(&mdev->asender); + + drbd_send_param(mdev,0); + clear_bit(USE_DEGR_WFC_T,&mdev->flags); + + return 1; +} + +STATIC int drbd_recv_header(drbd_dev *mdev, Drbd_Header *h) +{ + int r; + + r = drbd_recv(mdev,h,sizeof(*h)); + + if (unlikely( r != sizeof(*h) )) { + ERR("short read expecting header on sock: r=%d\n",r); + return FALSE; + }; + h->command = be16_to_cpu(h->command); + h->length = be16_to_cpu(h->length); + if (unlikely( h->magic != BE_DRBD_MAGIC )) { + ERR("magic?? m: 0x%lx c: %d l: %d\n", + (long)be32_to_cpu(h->magic), + h->command, h->length); + return FALSE; + } + mdev->last_received = jiffies; + + return TRUE; +} + +STATIC int receive_Barrier(drbd_dev *mdev, Drbd_Header* h) +{ + int rv; + int epoch_size; + Drbd_Barrier_Packet *p = (Drbd_Barrier_Packet*)h; + + ERR_IF(mdev->state != Secondary) return FALSE; + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + + rv = drbd_recv(mdev, h->payload, h->length); + ERR_IF(rv != h->length) return FALSE; + + inc_unacked(mdev); + + // DBG("got Barrier\n"); + + if (mdev->conf.wire_protocol != DRBD_PROT_C) + drbd_kick_lo(mdev); + + drbd_wait_ee(mdev,&mdev->active_ee); + + spin_lock_irq(&mdev->ee_lock); + rv = _drbd_process_ee(mdev,1); + + epoch_size=atomic_read(&mdev->epoch_size); + atomic_set(&mdev->epoch_size,0); + spin_unlock_irq(&mdev->ee_lock); + + rv &= drbd_send_b_ack(mdev, p->barrier, epoch_size); + dec_unacked(mdev); + + return rv; +} + +STATIC struct Tl_epoch_entry * +read_in_block(drbd_dev *mdev, int data_size) +{ + struct Tl_epoch_entry *e; + drbd_bio_t *bio; + int rr; + + spin_lock_irq(&mdev->ee_lock); + e=drbd_get_ee(mdev); + spin_unlock_irq(&mdev->ee_lock); + if(!e) return 0; + + bio = &e->private_bio; + + rr=drbd_recv(mdev, drbd_bio_kmap(bio), data_size); + drbd_bio_kunmap(bio); + + if ( rr != data_size) { + NOT_IN_26(clear_bit(BH_Lock, &bio->b_state);) + spin_lock_irq(&mdev->ee_lock); + drbd_put_ee(mdev,e); + spin_unlock_irq(&mdev->ee_lock); + WARN("short read receiving data block: read %d expected %d\n", + rr, data_size); + return 0; + } + mdev->recv_cnt+=data_size>>9; + + return e; +} + +STATIC void receive_data_tail(drbd_dev *mdev,int data_size) +{ + /* kick lower level device, if we have more than (arbitrary number) + * reference counts on it, which typically are locally submitted io + * requests. don't use unacked_cnt, so we speed up proto A and B, too. + * + * XXX maybe: make that arbitrary number configurable. + * for now, I choose 1/16 of max-epoch-size. 
+ */ + if (atomic_read(&mdev->local_cnt) >= (mdev->conf.max_epoch_size>>4) ) { + drbd_kick_lo(mdev); + } + mdev->writ_cnt+=data_size>>9; +} + +STATIC int recv_dless_read(drbd_dev *mdev, drbd_request_t *req, + sector_t sector, int data_size) +{ + drbd_bio_t *bio; + int ok,rr; + + bio = req->master_bio; + + D_ASSERT( sector == drbd_req_get_sector(req) ); + + rr=drbd_recv(mdev,drbd_bio_kmap(bio),data_size); + drbd_bio_kunmap(bio); + + ok=(rr==data_size); + drbd_bio_endio(bio,ok); + dec_ap_bio(mdev); + + dec_ap_pending(mdev); + return ok; +} + +STATIC int e_end_resync_block(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w; + sector_t sector = drbd_ee_get_sector(e); + int ok; + + drbd_rs_complete_io(mdev,sector); // before set_in_sync() ! + if (likely( drbd_bio_uptodate(&e->private_bio) )) { + ok = !test_bit(DISKLESS,&mdev->flags) && + !test_bit(PARTNER_DISKLESS,&mdev->flags); + if (likely( ok )) { + drbd_set_in_sync(mdev, sector, drbd_ee_get_size(e)); + /* THINK maybe don't send ack either + * when we are suddenly diskless? + * Dropping it here should do no harm, + * since peer has no structs referencing this. + */ + } + ok = drbd_send_ack(mdev,WriteAck,e); + set_bit(SYNC_STARTED,&mdev->flags); + } else { + ok = drbd_send_ack(mdev,NegAck,e); + ok&= drbd_io_error(mdev); + } + dec_unacked(mdev); + + return ok; +} + +STATIC int recv_resync_read(drbd_dev *mdev,sector_t sector, int data_size) +{ + struct Tl_epoch_entry *e; + + e = read_in_block(mdev,data_size); + if(!e) return FALSE; + + dec_rs_pending(mdev); + + e->block_id = ID_SYNCER; + if(!inc_local(mdev)) { + if (DRBD_ratelimit(5*HZ,5)) + ERR("Can not write resync data to local disk.\n"); + drbd_send_ack(mdev,NegAck,e); + spin_lock_irq(&mdev->ee_lock); + drbd_put_ee(mdev,e); + spin_unlock_irq(&mdev->ee_lock); + return TRUE; + } + + drbd_ee_prepare_write(mdev,e,sector,data_size); + e->w.cb = e_end_resync_block; + + spin_lock_irq(&mdev->ee_lock); + list_add(&e->w.list,&mdev->sync_ee); + spin_unlock_irq(&mdev->ee_lock); + + inc_unacked(mdev); + + drbd_generic_make_request(WRITE,&e->private_bio); + + receive_data_tail(mdev,data_size); + return TRUE; +} + +STATIC int receive_DataReply(drbd_dev *mdev,Drbd_Header* h) +{ + drbd_request_t *req; + sector_t sector; + unsigned int header_size,data_size; + int ok; + Drbd_Data_Packet *p = (Drbd_Data_Packet*)h; + + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + /* I expect a block to be a multiple of 512 byte, and + * no more than 4K (PAGE_SIZE). is this too restrictive? + */ + ERR_IF(data_size == 0) return FALSE; + ERR_IF(data_size & 0x1ff) return FALSE; + ERR_IF(data_size > PAGE_SIZE) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + sector = be64_to_cpu(p->sector); + + req = (drbd_request_t *)(long)p->block_id; + D_ASSERT(req->w.cb == w_is_app_read); + + spin_lock(&mdev->pr_lock); + list_del(&req->w.list); + spin_unlock(&mdev->pr_lock); + + ok = recv_dless_read(mdev,req,sector,data_size); + + INVALIDATE_MAGIC(req); + mempool_free(req,drbd_request_mempool); + + return ok; +} + +STATIC int receive_RSDataReply(drbd_dev *mdev,Drbd_Header* h) +{ + sector_t sector; + unsigned int header_size,data_size; + int ok; + Drbd_Data_Packet *p = (Drbd_Data_Packet*)h; + + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + /* I expect a block to be a multiple of 512 byte, and + * no more than 4K (PAGE_SIZE). is this too restrictive? 
+ */ + ERR_IF(data_size == 0) return FALSE; + ERR_IF(data_size & 0x1ff) return FALSE; + ERR_IF(data_size > PAGE_SIZE) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + sector = be64_to_cpu(p->sector); + D_ASSERT(p->block_id == ID_SYNCER); + + ok = recv_resync_read(mdev,sector,data_size); + + return ok; +} + +STATIC int e_end_block(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w; + sector_t sector = drbd_ee_get_sector(e); + int ok=1; + + atomic_inc(&mdev->epoch_size); + if(mdev->conf.wire_protocol == DRBD_PROT_C) { + if(likely(drbd_bio_uptodate(&e->private_bio))) { + ok=drbd_send_ack(mdev,WriteAck,e); + if (ok && test_bit(SYNC_STARTED,&mdev->flags) ) + drbd_set_in_sync(mdev,sector,drbd_ee_get_size(e)); + } else { + ok = drbd_send_ack(mdev,NegAck,e); + ok&= drbd_io_error(mdev); + /* we expect it to be marked out of sync anyways... + * maybe assert this? + */ + } + dec_unacked(mdev); + + return ok; + } + + if(unlikely(!drbd_bio_uptodate(&e->private_bio))) { + ok = drbd_io_error(mdev); + } + + return ok; +} + +// mirrored write +STATIC int receive_Data(drbd_dev *mdev,Drbd_Header* h) +{ + sector_t sector; + struct Tl_epoch_entry *e; + Drbd_Data_Packet *p = (Drbd_Data_Packet*)h; + int header_size,data_size; + + // FIXME merge this code dups into some helper function + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + /* I expect a block to be a multiple of 512 byte, and + * no more than 4K (PAGE_SIZE). is this too restrictive? + */ + ERR_IF(data_size == 0) return FALSE; + ERR_IF(data_size & 0x1ff) return FALSE; + ERR_IF(data_size > PAGE_SIZE) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + sector = be64_to_cpu(p->sector); + + e = read_in_block(mdev,data_size); + if (!e) return FALSE; + e->block_id = p->block_id; // no meaning on this side, e* on partner + + if(!inc_local(mdev)) { + if (DRBD_ratelimit(5*HZ,5)) + ERR("Can not write mirrored data block to local disk.\n"); + drbd_send_ack(mdev,NegAck,e); + spin_lock_irq(&mdev->ee_lock); + drbd_put_ee(mdev,e); + spin_unlock_irq(&mdev->ee_lock); + return TRUE; + } + + drbd_ee_prepare_write(mdev, e, sector, data_size); + e->w.cb = e_end_block; + + spin_lock_irq(&mdev->ee_lock); + list_add(&e->w.list,&mdev->active_ee); + spin_unlock_irq(&mdev->ee_lock); + + switch(mdev->conf.wire_protocol) { + case DRBD_PROT_C: + inc_unacked(mdev); + break; + case DRBD_PROT_B: + drbd_send_ack(mdev, RecvAck, e); + break; + case DRBD_PROT_A: + // nothing to do + break; + } + + drbd_generic_make_request(WRITE,&e->private_bio); + + receive_data_tail(mdev,data_size); + return TRUE; +} + +STATIC int receive_DataRequest(drbd_dev *mdev,Drbd_Header *h) +{ + sector_t sector; + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); + struct Tl_epoch_entry *e; + int size; + Drbd_BlockRequest_Packet *p = (Drbd_BlockRequest_Packet*)h; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + /* + * handled by NegDReply below ... 
+ ERR_IF (test_bit(DISKLESS,&mdev->flags)) { + return FALSE; + ERR_IF ( (mdev->gen_cnt[Flags] & MDF_Consistent) == 0 ) + return FALSE; + */ + + if (size <= 0 || (size & 0x1ff) != 0 || size > PAGE_SIZE) { + ERR("%s:%d: sector: %lu, size: %d\n", __FILE__, __LINE__, + (unsigned long)sector,size); + return FALSE; + } + if ( sector + (size>>9) > capacity) { + ERR("%s:%d: sector: %lu, size: %d\n", __FILE__, __LINE__, + (unsigned long)sector,size); + return FALSE; + } + + spin_lock_irq(&mdev->ee_lock); + e=drbd_get_ee(mdev); + if(!e) { + spin_unlock_irq(&mdev->ee_lock); + return FALSE; + } + e->block_id = p->block_id; // no meaning on this side, pr* on partner + list_add(&e->w.list,&mdev->read_ee); + spin_unlock_irq(&mdev->ee_lock); + + if(!inc_local(mdev) || (mdev->gen_cnt[Flags] & MDF_Consistent) == 0) { + if (DRBD_ratelimit(5*HZ,5)) + ERR("Can not satisfy peer's read request, no local data.\n"); + drbd_send_ack(mdev,NegDReply,e); + spin_lock_irq(&mdev->ee_lock); + drbd_put_ee(mdev,e); + spin_unlock_irq(&mdev->ee_lock); + return TRUE; + } + + drbd_ee_prepare_read(mdev,e,sector,size); + + switch (h->command) { + case DataRequest: + e->w.cb = w_e_end_data_req; + break; + case RSDataRequest: + e->w.cb = w_e_end_rsdata_req; + /* Eventually this should become asynchrously. Currently it + * blocks the whole receiver just to delay the reading of a + * resync data block. + * the drbd_work_queue mechanism is made for this... + */ + if (!drbd_rs_begin_io(mdev,sector)) { + // we have been interrupted, probably connection lost! + D_ASSERT(signal_pending(current)); + drbd_put_ee(mdev,e); + return 0; + } + break; + default: + ERR("unexpected command (%s) in receive_DataRequest\n", + cmdname(h->command)); + } + + mdev->read_cnt += size >> 9; + inc_unacked(mdev); + drbd_generic_make_request(READ,&e->private_bio); + if (atomic_read(&mdev->local_cnt) >= (mdev->conf.max_epoch_size>>4) ) { + drbd_kick_lo(mdev); + } + + + return TRUE; +} + +STATIC int receive_SyncParam(drbd_dev *mdev,Drbd_Header *h) +{ + int ok = TRUE; + Drbd_SyncParam_Packet *p = (Drbd_SyncParam_Packet*)h; + + // FIXME move into helper + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + // XXX harmless race with ioctl ... + mdev->sync_conf.rate = be32_to_cpu(p->rate); + mdev->sync_conf.use_csums = be32_to_cpu(p->use_csums); + mdev->sync_conf.skip = be32_to_cpu(p->skip); + drbd_alter_sg(mdev, be32_to_cpu(p->group)); + + if ( (mdev->cstate == SkippedSyncS || mdev->cstate == SkippedSyncT) + && !mdev->sync_conf.skip ) + { + set_cstate(mdev,WFReportParams); + ok = drbd_send_param(mdev,0); + } + + return ok; +} + +STATIC int drbd_sync_handshake(drbd_dev *mdev, Drbd_Parameter_Packet *p) +{ + int have_good,sync; + + have_good = drbd_md_compare(mdev,p); + + if(have_good==0) { + if (drbd_md_test_flag(mdev,MDF_PrimaryInd)) { + /* gen counts compare the same, but I have the + * PrimaryIndicator set. so the peer has, too + * (otherwise this would not compare the same). + * so we had a split brain! + * + * FIXME maybe log MDF_SplitBran into metadata, + * and refuse to do anything until told otherwise! + * + * for now: just go StandAlone. 
+ */ + ALERT("Split-Brain detected, dropping connection!\n"); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + sync=0; + } else { + sync=1; + } + + drbd_dump_md(mdev,p,0); + // INFO("have_good=%d sync=%d\n", have_good, sync); + + if (have_good > 0 && !drbd_md_test_flag(mdev,MDF_Consistent)) { + /* doh. I cannot become SyncSource when I am inconsistent! + */ + ERR("I shall become SyncSource, but I am inconsistent!\n"); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + if (have_good < 0 && + !(be32_to_cpu(p->gen_cnt[Flags]) & MDF_Consistent) ) { + /* doh. Peer cannot become SyncSource when inconsistent + */ + ERR("I shall become SyncTarget, but Peer is inconsistent!\n"); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + + if ( mdev->sync_conf.skip && sync ) { + if (have_good == 1) + set_cstate(mdev,SkippedSyncS); + else // have_good == -1 + set_cstate(mdev,SkippedSyncT); + return TRUE; + } + + if( sync ) { + if(have_good == 1) { + D_ASSERT(drbd_md_test_flag(mdev,MDF_Consistent)); + set_cstate(mdev,WFBitMapS); + wait_event(mdev->cstate_wait, + atomic_read(&mdev->ap_bio_cnt)==0); + drbd_bm_lock(mdev); // { + drbd_send_bitmap(mdev); + drbd_bm_unlock(mdev); // } + } else { // have_good == -1 + if ( (mdev->state == Primary) && + drbd_md_test_flag(mdev,MDF_Consistent) ) { + /* FIXME + * allow Primary become SyncTarget if it was + * diskless, and now had a storage reattached. + * only somewhere the MDF_Consistent flag is + * set where it should not... I think. + */ + ERR("Current Primary shall become sync TARGET!" + " Aborting to prevent data corruption.\n"); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + drbd_md_clear_flag(mdev,MDF_Consistent); + set_cstate(mdev,WFBitMapT); + } + } else { + set_cstate(mdev,Connected); + drbd_bm_lock(mdev); // { + if(drbd_bm_total_weight(mdev)) { + if (drbd_md_test_flag(mdev,MDF_Consistent)) { + /* We are not going to do a resync but there + are marks in the bitmap. + (Could be from the AL, or someone used + the write_gc.pl program) + Clean the bitmap... + */ + INFO("No resync -> clearing bit map.\n"); + drbd_bm_clear_all(mdev); + drbd_bm_write(mdev); + } else { + WARN("I am inconsistent, but there is no sync? BOTH nodes inconsistent!\n"); + } + } + drbd_bm_unlock(mdev); // } + } + + if (have_good == -1) { + /* Sync-Target has to adopt source's gen_cnt. */ + int i; + for(i=HumanCnt;igen_cnt[i]=be32_to_cpu(p->gen_cnt[i]); + } + } + return TRUE; +} + +STATIC int receive_param(drbd_dev *mdev, Drbd_Header *h) +{ + Drbd_Parameter_Packet *p = (Drbd_Parameter_Packet*)h; + int consider_sync; + int oo_state,i; + sector_t p_size, p_usize, my_usize; + + if (h->length != (sizeof(*p)-sizeof(*h))) { + ERR("Incompatible packet size of Parameter packet!\n"); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + if (p->magic != BE_DRBD_MAGIC) { + ERR("invalid Parameter_Packet magic! Protocol version: me %d, peer %d\n", + PRO_VERSION, be32_to_cpu(p->version)); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + + if(be32_to_cpu(p->version)!=PRO_VERSION) { + ERR("incompatible releases! 
Protocol version: me %d, peer %d\n", + PRO_VERSION, be32_to_cpu(p->version)); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + + oo_state = be32_to_cpu(p->state); + if (oo_state != Primary && oo_state != Secondary) { + ERR("unexpected peer state: 0x%x\n", oo_state); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + + if(be32_to_cpu(p->state) == Primary && mdev->state == Primary ) { + ERR("incompatible states (both Primary!)\n"); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + + if(be32_to_cpu(p->protocol)!=mdev->conf.wire_protocol) { + int peer_proto = be32_to_cpu(p->protocol); + if (DRBD_PROT_A <= peer_proto && peer_proto <= DRBD_PROT_C) { + ERR("incompatible communication protocols: " + "me %c, peer %c\n", + 'A'-1+mdev->conf.wire_protocol, + 'A'-1+peer_proto); + } else { + ERR("incompatible communication protocols: " + "me %c, peer [%d]\n", + 'A'-1+mdev->conf.wire_protocol, + peer_proto); + } + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + + p_size=be64_to_cpu(p->p_size); + + if(p_size == 0 && test_bit(DISKLESS,&mdev->flags)) { + /* FIXME maybe allow connection, + * but refuse to become primary? */ + ERR("some backing storage is needed\n"); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + + drbd_bm_lock(mdev); // { + mdev->p_size=p_size; + + set_bit(MD_DIRTY,&mdev->flags); // we are changing state! + + p_usize=be64_to_cpu(p->u_size); + /* + * you may get a flip-flop connection established/connection loss, in + * case both really have different usize uppon first connect! + * try to solve it thus: + ***/ +#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) + if (mdev->cstate == WFReportParams) { + /* this is first connect, or an otherwise expected param + * exchange. choose the minimum */ + p_usize = min_not_zero(mdev->lo_usize, p_usize); + } else { + /* this was an "unexpected" param packet, + * just do what the peer suggests */ + } +#undef min_not_zero + + my_usize = mdev->lo_usize; + + if( mdev->lo_usize > p_usize ) { + mdev->lo_usize = p_usize; + INFO("Peer sets u_size to %lu KB\n", + (unsigned long)mdev->lo_usize); + } + + if( drbd_new_dev_size(mdev) < + (drbd_get_capacity(mdev->this_bdev)>>1) && + mdev->gen_cnt[Flags] & MDF_Consistent ) { + ERR("The peer's disk size is too small!\n"); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + mdev->lo_usize = my_usize; + return FALSE; + } + + consider_sync = (mdev->cstate == WFReportParams); + drbd_determin_dev_size(mdev); + if(drbd_disk_less_node_present(mdev)) consider_sync=0; + if(test_bit(DISKLESS, &mdev->flags)) consider_sync=0; + + drbd_bm_unlock(mdev); // } + + if(be32_to_cpu(p->flags)&1) { + consider_sync=1; + drbd_send_param(mdev,2); + } + if(be32_to_cpu(p->flags)&2) consider_sync=1; + + // XXX harmless race with ioctl ... + mdev->sync_conf.rate = + max_t(int,mdev->sync_conf.rate, be32_to_cpu(p->sync_rate)); + + // if one of them wants to skip, both of them should skip. + mdev->sync_conf.skip = + mdev->sync_conf.skip != 0 || p->skip_sync != 0; + mdev->sync_conf.group = + min_t(int,mdev->sync_conf.group,be32_to_cpu(p->sync_group)); + + if(!p_size) { + /* no point in trying to sync a diskless peer: */ + consider_sync = 0; + if (!test_and_set_bit(PARTNER_DISKLESS, &mdev->flags)) { + /* if we got here, we *do* have a disk. + * but it may be inconsistent... 
+ * anyways, record that next time we need a full sync. + */ + clear_bit(PARTNER_CONSISTENT, &mdev->flags); + drbd_md_set_flag(mdev,MDF_FullSync); + drbd_md_write(mdev); + /* actually we'd need to bm_fill_bm(,-1); drbd_write_bm(mdev); + * but this is not necessary _now_. + * we have the MDF_FullSync bit on disk. + * on the next _drbd_send_bitmap this will be done. + */ + WARN("PARTNER DISKLESS\n"); + mdev->rs_total = 0; + } + if(mdev->cstate >= Connected ) { + if(mdev->state == Primary) tl_clear(mdev); + if(mdev->state == Primary || + be32_to_cpu(p->state) == Primary ) { + drbd_md_inc(mdev,ConnectedCnt); + } + } + if(mdev->cstate > Connected ) { + WARN("Resync aborted.\n"); + set_cstate(mdev,Connected); + } + } else { + if (test_and_clear_bit(PARTNER_DISKLESS, &mdev->flags)) { + WARN("Partner no longer diskless\n"); + D_ASSERT(consider_sync); + } + } + + if (be32_to_cpu(p->gen_cnt[Flags]) & MDF_Consistent) { + set_bit(PARTNER_CONSISTENT, &mdev->flags); + } else { + clear_bit(PARTNER_CONSISTENT, &mdev->flags); + } + + if (mdev->cstate == WFReportParams) { + INFO("Connection established.\n"); + } + + if (consider_sync) { + if (!drbd_sync_handshake(mdev,p)) return FALSE; + } + + if (mdev->cstate == WFReportParams) set_cstate(mdev,Connected); + + oo_state = mdev->o_state; + mdev->o_state = be32_to_cpu(p->state); + if(oo_state == Secondary && mdev->o_state == Primary) { + /* Secondary has to adopt primary's gen_cnt. */ + for(i=HumanCnt;igen_cnt[i]=be32_to_cpu(p->gen_cnt[i]); + } + } + + if (oo_state != mdev->o_state) { + INFO( "%s/%s --> %s/%s\n", + nodestate_to_name(mdev->state), + nodestate_to_name(oo_state), + nodestate_to_name(mdev->state), + nodestate_to_name(mdev->o_state) ); + /* FIXME assertion for (gencounts do not diverge) */ + } + drbd_md_write(mdev); // update connected indicator, la_size, ... + + return TRUE; +} + +/* Since we are processing the bitfild from lower addresses to higher, + it does not matter if the process it in 32 bit chunks or 64 bit + chunks as long as it is little endian. (Understand it as byte stream, + beginning with the lowest byte...) If we would use big endian + we would need to process it from the highest address to the lowest, + in order to be agnostic to the 32 vs 64 bits issue. + + returns 0 on failure, 1 if we suceessfully received it. */ +STATIC int receive_bitmap(drbd_dev *mdev, Drbd_Header *h) +{ + size_t bm_words, bm_i, want, num_words; + unsigned long *buffer; + int ok=FALSE; + + drbd_bm_lock(mdev); // { + + bm_words = drbd_bm_words(mdev); + bm_i = 0; + buffer = vmalloc(BM_PACKET_WORDS*sizeof(long)); + + while (1) { + num_words = min_t(size_t, BM_PACKET_WORDS, bm_words-bm_i ); + want = num_words * sizeof(long); + ERR_IF(want != h->length) goto out; + if (want==0) break; + if (drbd_recv(mdev, buffer, want) != want) + goto out; + + drbd_bm_merge_lel(mdev, bm_i, num_words, buffer); + bm_i += num_words; + + if (!drbd_recv_header(mdev,h)) + goto out; + D_ASSERT(h->command == ReportBitMap); + } + + if (mdev->cstate == WFBitMapS) { + drbd_start_resync(mdev,SyncSource); + } else if (mdev->cstate == WFBitMapT) { + ok = drbd_send_bitmap(mdev); + if (!ok) goto out; + drbd_start_resync(mdev,SyncTarget); // XXX cannot fail ??? + } else { + ERR("unexpected cstate (%s) in receive_bitmap\n", + cstate_to_name(mdev->cstate)); + } + + // We just started resync. Now we can be sure that local disk IO is okay. + + /* no, actually we can't. failures happen asynchronously, anytime. + * we can never be sure. disk may have failed while we where busy shaking hands... 
+ */ +/* + * FIXME this should only be D_ASSERT here. + * *doing* it here masks a logic bug elsewhere, I think. + */ + D_ASSERT(!test_bit(PARTNER_DISKLESS,&mdev->flags)); + D_ASSERT(!test_bit(DISKLESS,&mdev->flags)); +// EXPLAIN: + clear_bit(MD_IO_ALLOWED,&mdev->flags); + + ok=TRUE; + out: + drbd_bm_unlock(mdev); // } + vfree(buffer); + return ok; +} + +STATIC void drbd_fail_pending_reads(drbd_dev *mdev) +{ + struct list_head *le; + drbd_bio_t *bio; + LIST_HEAD(workset); + + /* + * Application READ requests + */ + spin_lock(&mdev->pr_lock); + list_splice_init(&mdev->app_reads,&workset); + spin_unlock(&mdev->pr_lock); + + while(!list_empty(&workset)) { + drbd_request_t *req; + le = workset.next; + req = list_entry(le, drbd_request_t, w.list); + list_del(le); + + bio = req->master_bio; + + drbd_bio_IO_error(bio); + dec_ap_bio(mdev); + dec_ap_pending(mdev); + + INVALIDATE_MAGIC(req); + mempool_free(req,drbd_request_mempool); + } +} + +STATIC int receive_skip(drbd_dev *mdev,Drbd_Header *h) +{ + // TODO zero copy sink :) + static char sink[128]; + int size,want,r; + + WARN("skipping unknown optional packet type %d, l: %d!\n", + h->command, h->length ); + + size = h->length; + while (size > 0) { + want = min_t(int,size,sizeof(sink)); + r = drbd_recv(mdev,sink,want); + ERR_IF(r < 0) break; + size -= r; + } + return (size == 0); +} + +STATIC int receive_BecomeSyncTarget(drbd_dev *mdev, Drbd_Header *h) +{ + ERR_IF(!mdev->bitmap) return FALSE; + ERR_IF(mdev->state != Secondary) + return FALSE; + ERR_IF(mdev->cstate != Connected) + return FALSE; + ERR_IF(test_bit(DISKLESS,&mdev->flags)) + return FALSE; + + drbd_bm_lock(mdev); + drbd_bm_set_all(mdev); + drbd_bm_write(mdev); + drbd_start_resync(mdev,SyncTarget); + drbd_bm_unlock(mdev); + return TRUE; +} + +STATIC int receive_BecomeSyncSource(drbd_dev *mdev, Drbd_Header *h) +{ + ERR_IF(mdev->cstate != Connected) + return FALSE; + ERR_IF(test_bit(DISKLESS,&mdev->flags)) + return FALSE; + ERR_IF(!drbd_md_test_flag(mdev,MDF_Consistent)) + return FALSE; + + drbd_bm_lock(mdev); + drbd_bm_set_all(mdev); + drbd_bm_write(mdev); + drbd_start_resync(mdev,SyncSource); + drbd_bm_unlock(mdev); + return TRUE; +} + +STATIC int receive_UnplugRemote(drbd_dev *mdev, Drbd_Header *h) +{ + if (!test_bit(DISKLESS,&mdev->flags)) drbd_kick_lo(mdev); + return TRUE; // cannot fail. 
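(Illustrative sketch, not part of the patch: receive_skip() above drains an unknown optional packet through a small fixed sink instead of allocating a payload-sized buffer. Below is the same pattern as a self-contained user-space helper; drain_payload() and the use of read(2) on a plain file descriptor are inventions for this example, the kernel code uses drbd_recv() on mdev's data socket.)

/* discard `length` bytes from fd using a small fixed sink,
 * the same bounded-chunk pattern receive_skip() uses */
#include <unistd.h>

static int drain_payload(int fd, size_t length)
{
        char sink[128];

        while (length > 0) {
                size_t  want = length < sizeof(sink) ? length : sizeof(sink);
                ssize_t r    = read(fd, sink, want);

                if (r <= 0)
                        return -1;      /* error or EOF: caller drops the connection */
                length -= (size_t)r;
        }
        return 0;
}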
+} + +typedef int (*drbd_cmd_handler_f)(drbd_dev*,Drbd_Header*); + +static drbd_cmd_handler_f drbd_default_handler[] = { + [Data] = receive_Data, + [DataReply] = receive_DataReply, + [RSDataReply] = receive_RSDataReply, + [RecvAck] = NULL, //receive_RecvAck, + [WriteAck] = NULL, //receive_WriteAck, + [Barrier] = receive_Barrier, + [BarrierAck] = NULL, //receive_BarrierAck, + [ReportParams] = receive_param, + [ReportBitMap] = receive_bitmap, + [Ping] = NULL, //receive_Ping, + [PingAck] = NULL, //receive_PingAck, + [BecomeSyncTarget] = receive_BecomeSyncTarget, + [BecomeSyncSource] = receive_BecomeSyncSource, + [UnplugRemote] = receive_UnplugRemote, + [DataRequest] = receive_DataRequest, + [RSDataRequest] = receive_DataRequest, //receive_RSDataRequest, + [SyncParam] = receive_SyncParam, +}; + +static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler; +static drbd_cmd_handler_f *drbd_opt_cmd_handler = NULL; + +STATIC void drbdd(drbd_dev *mdev) +{ + drbd_cmd_handler_f handler; + Drbd_Header *header = &mdev->data.rbuf.head; + + for (;;) { + if (!drbd_recv_header(mdev,header)) + break; + + if (header->command < MAX_CMD) + handler = drbd_cmd_handler[header->command]; + else if (MayIgnore < header->command && header->command < MAX_OPT_CMD) + handler = drbd_opt_cmd_handler[header->command-MayIgnore]; + else if (header->command > MAX_OPT_CMD) + handler = receive_skip; + else + handler = NULL; + + if (unlikely(!handler)) { + ERR("unknown packet type %d, l: %d!\n", + header->command, header->length); + break; + } + if (mdev->cstate == WFReportParams && header->command != ReportParams) { + ERR("received %s packet while WFReportParams!?\n", + cmdname(header->command)); + } + if (unlikely(!handler(mdev,header))) { + ERR("error receiving %s, l: %d!\n", + cmdname(header->command), header->length); + break; + } + dump_packet(mdev,mdev->data.socket,2,&mdev->data.rbuf, __FILE__, __LINE__); + } +} + +STATIC void drbd_disconnect(drbd_dev *mdev) +{ + D_ASSERT(mdev->cstate < Connected); + mdev->o_state = Unknown; + + /* in case we have been syncing, and then we drop the connection, + * we need to "w_resume_next_sg", which we try to achieve by + * setting the STOP_SYNC_TIMER bit, and schedulung the timer for + * immediate execution. + * unfortunately we cannot be sure that the timer already triggered. + * + * so we del_timer_sync here, and check that bit. + * if it is still set, we queue w_resume_next_sg anyways, + * just to be sure. + */ + + del_timer_sync(&mdev->resync_timer); + spin_lock_irq(&mdev->req_lock); + if (test_and_clear_bit(STOP_SYNC_TIMER,&mdev->flags)) { + mdev->resync_work.cb = w_resume_next_sg; + if (list_empty(&mdev->resync_work.list)) + _drbd_queue_work(&mdev->data.work,&mdev->resync_work); + // else: already queued, we only need to release the lock. + } else { + D_ASSERT(mdev->resync_work.cb == w_resync_inactive); + } + spin_unlock_irq(&mdev->req_lock); + + + drbd_thread_stop_nowait(&mdev->worker); + drbd_thread_stop(&mdev->asender); + + while(down_trylock(&mdev->data.mutex)) { + struct task_struct *task; + spin_lock(&mdev->send_task_lock); + if((task=mdev->send_task)) { + force_sig(DRBD_SIG, task); + spin_unlock(&mdev->send_task_lock); + down(&mdev->data.mutex); + break; + } else { + spin_unlock(&mdev->send_task_lock); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ / 10); + } + } + /* By grabbing the sock_mutex we make sure that no one + uses the socket right now. 
*/ + drbd_free_sock(mdev); + up(&mdev->data.mutex); + + drbd_fail_pending_reads(mdev); + drbd_thread_stop(&mdev->worker); + drbd_rs_cancel_all(mdev); + + // secondary + drbd_wait_ee(mdev,&mdev->active_ee); + drbd_wait_ee(mdev,&mdev->sync_ee); + drbd_clear_done_ee(mdev); + + // primary + tl_clear(mdev); + clear_bit(ISSUE_BARRIER,&mdev->flags); + wait_event( mdev->cstate_wait, atomic_read(&mdev->ap_pending_cnt)==0 ); + D_ASSERT(mdev->oldest_barrier->n_req == 0); + + // both + clear_bit(PARTNER_CONSISTENT, &mdev->flags); + clear_bit(PARTNER_DISKLESS,&mdev->flags); + + D_ASSERT(mdev->ee_in_use == 0); + D_ASSERT(list_empty(&mdev->read_ee)); // done by termination of worker + D_ASSERT(list_empty(&mdev->active_ee)); // done here + D_ASSERT(list_empty(&mdev->sync_ee)); // done here + D_ASSERT(list_empty(&mdev->done_ee)); // done here + + atomic_set(&mdev->epoch_size,0); + mdev->rs_total=0; + + if(atomic_read(&mdev->unacked_cnt)) { + ERR("unacked_cnt = %d\n",atomic_read(&mdev->unacked_cnt)); + atomic_set(&mdev->unacked_cnt,0); + } + + /* We do not have data structures that would allow us to + get the rs_pending_cnt down to 0 again. + * On SyncTarget we do not have any data structures describing + the pending RSDataRequest's we have sent. + * On SyncSource there is no data structure that tracks + the RSDataReply blocks that we sent to the SyncTarget. + And no, it is not the sum of the reference counts in the + resync_LRU. The resync_LRU tracks the whole operation including + the disk-IO, while the rs_pending_cnt only tracks the blocks + on the fly. */ + atomic_set(&mdev->rs_pending_cnt,0); + + if(atomic_read(&mdev->ap_pending_cnt)) { + ERR("ap_pending_cnt = %d\n",atomic_read(&mdev->ap_pending_cnt)); + atomic_set(&mdev->ap_pending_cnt,0); + } + + wake_up(&mdev->cstate_wait); + + if ( mdev->state == Primary && + ( test_bit(DISKLESS,&mdev->flags) + || !drbd_md_test_flag(mdev,MDF_Consistent) ) ) { + drbd_thread_stop_nowait(&mdev->receiver); + drbd_panic("Sorry, I have no access to good data anymore.\n"); + return; + } + + if (get_t_state(&mdev->receiver) == Exiting) { + if (test_bit(DISKLESS,&mdev->flags)) { + // Secondary + set_cstate(mdev,Unconfigured); + drbd_mdev_cleanup(mdev); + } else { + set_cstate(mdev,StandAlone); + drbd_thread_start(&mdev->worker); + } + } else { + set_cstate(mdev,Unconnected); + drbd_thread_start(&mdev->worker); + } + + if (mdev->state == Primary) { + if(!test_bit(DO_NOT_INC_CONCNT,&mdev->flags)) + drbd_md_inc(mdev,ConnectedCnt); + drbd_md_write(mdev); + } + clear_bit(DO_NOT_INC_CONCNT,&mdev->flags); + + /* it may still be set, because some unplug was on the fly */ + NOT_IN_26(mdev->flags &= ~(1<receiver ... + Drbd_HandShake_Packet *p = &mdev->data.sbuf.HandShake; + int ok; + + if (down_interruptible(&mdev->data.mutex)) { + ERR("interrupted during initial handshake\n"); + return 0; /* interrupted. not ok. */ + } + memset(p,0,sizeof(*p)); + p->protocol_version = cpu_to_be32(PRO_VERSION); + ok = _drbd_send_cmd( mdev, mdev->data.socket, HandShake, + (Drbd_Header *)p, sizeof(*p), 0 ); + up(&mdev->data.mutex); + return ok; +} + +/* + * return values: + * 1 yess, we have a valid connection + * 0 oops, did not work out, please try again + * -1 peer talks different language, + * no point in trying again, please go standalone. + */ +STATIC int drbd_do_handshake(drbd_dev *mdev) +{ + // ASSERT current == mdev->receiver ... 
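(Illustrative sketch, not part of the patch: drbd_send_handshake() and drbd_do_handshake() exchange a small packet whose interesting payload is the 32-bit protocol version, sent big endian; a peer speaking a different dialect is rejected with -1 so the node goes standalone instead of retrying forever. A simplified user-space picture of that check follows; the struct layout and all names here are made up, the real Drbd_HandShake_Packet is declared in drbd_int.h and carries more fields.)

#include <stdint.h>
#include <arpa/inet.h>

struct hs_pkt {                    /* hypothetical, flattened layout */
        uint32_t magic;            /* BE_DRBD_MAGIC, big endian */
        uint16_t command;          /* HandShake */
        uint16_t length;           /* payload length, big endian */
        uint32_t protocol_version; /* PRO_VERSION, big endian */
};

/* 1 = compatible, -1 = incompatible dialect (go standalone),
 * mirroring the 1 / -1 results of drbd_do_handshake() */
static int check_peer_version(uint32_t my_version, const struct hs_pkt *p)
{
        uint32_t peer = ntohl(p->protocol_version);

        /* the receiver also tolerates a peer that is exactly one
         * version ahead, it only warns "You should upgrade me!" */
        if (peer == my_version || peer == my_version + 1)
                return 1;
        return -1;
}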
+ Drbd_HandShake_Packet *p = &mdev->data.rbuf.HandShake; + const int expect = sizeof(Drbd_HandShake_Packet)-sizeof(Drbd_Header); + int rv; + + rv = drbd_send_handshake(mdev); + if (!rv) return 0; + + rv = drbd_recv_header(mdev,&p->head); + if (!rv) return 0; + + if (p->head.command == ReportParams) { + ERR("expected HandShake packet, received ReportParams...\n"); + ERR("peer probaly runs some incompatible 0.7 -preX version\n"); + return -1; + } else if (p->head.command != HandShake) { + ERR( "expected HandShake packet, received: %s (0x%04x)\n", + cmdname(p->head.command), p->head.command ); + return -1; + } + + if (p->head.length != expect) { + ERR( "expected HandShake length: %u, received: %u\n", + expect, p->head.length ); + return -1; + } + + rv = drbd_recv(mdev, &p->head.payload, expect); + + if (rv != expect) { + ERR("short read receiving handshake packet: l=%u\n", rv); + return 0; + } + + dump_packet(mdev,mdev->data.socket,2,&mdev->data.rbuf, __FILE__, __LINE__); + + p->protocol_version = be32_to_cpu(p->protocol_version); + + if ( p->protocol_version == PRO_VERSION || + p->protocol_version == (PRO_VERSION+1) ) { + if (p->protocol_version == (PRO_VERSION+1)) { + WARN( "You should upgrade me! " + "Peer wants protocol version: %u\n", + p->protocol_version ); + } + INFO( "Handshake successful: DRBD Network Protocol version %u\n", + PRO_VERSION ); + } /* else if ( p->protocol_version == (PRO_VERSION-1) ) { + // not yet; but next time :) + INFO( "Handshake successful: DRBD Protocol version %u\n", + (PRO_VERSION-1) ); + ... do some remapping of defaults and jump tables here ... + } */ else { + ERR( "incompatible DRBD dialects: " + "I support %u, peer wants %u\n", + PRO_VERSION, p->protocol_version ); + return -1; + } + + return 1; +} + +int drbdd_init(struct Drbd_thread *thi) +{ + drbd_dev *mdev = thi->mdev; + int minor = (int)(mdev-drbd_conf); + int h; + + sprintf(current->comm, "drbd%d_receiver", minor); + + /* printk(KERN_INFO DEVICE_NAME ": receiver living/m=%d\n", minor); */ + + while (TRUE) { + h = drbd_connect(mdev); + if (h <= 0) { + /* FIXME DISKLESS StandAlone + * does not make much sense... + * drbd_disconnect should set cstate properly... 
+ */ + drbd_disconnect(mdev); + if (h == 0) { + schedule_timeout(HZ); + continue; + } + + WARN("Discarding network configuration.\n"); + set_cstate(mdev,StandAlone); + break; + } + if (get_t_state(thi) == Exiting) break; + drbdd(mdev); + drbd_disconnect(mdev); + if (get_t_state(thi) == Exiting) break; + if(mdev->conf.on_disconnect == DropNetConf) { + set_cstate(mdev,StandAlone); + break; + } + else { + if (signal_pending(current)) { + drbd_flush_signals(current); + } + spin_lock(&thi->t_lock); + D_ASSERT(thi->t_state == Restarting); + thi->t_state = Running; + spin_unlock(&thi->t_lock); + } + } + + INFO("receiver terminated\n"); + + return 0; +} + +/* ********* acknowledge sender ******** */ + +STATIC int got_Ping(drbd_dev *mdev, Drbd_Header* h) +{ + return drbd_send_ping_ack(mdev); + +} + +STATIC int got_PingAck(drbd_dev *mdev, Drbd_Header* h) +{ + // restore idle timeout + mdev->meta.socket->sk->SK_(rcvtimeo) = mdev->conf.ping_int*HZ; + + return TRUE; +} + +STATIC int got_BlockAck(drbd_dev *mdev, Drbd_Header* h) +{ + drbd_request_t *req; + Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h; + sector_t sector = be64_to_cpu(p->sector); + int blksize = be32_to_cpu(p->blksize); + + smp_rmb(); + if(likely(!test_bit(PARTNER_DISKLESS,&mdev->flags))) { + // test_bit(PARTNER_DISKLESS,&mdev->flags) + // This happens if one a few IO requests on the peer + // failed, and some subsequest completed sucessfull + // afterwards. + + // But we killed everything out of the transferlog + // as we got the news hat IO is broken on the peer. + + if( is_syncer_blk(mdev,p->block_id)) { + drbd_set_in_sync(mdev,sector,blksize); + set_bit(SYNC_STARTED,&mdev->flags); + } else { + req=(drbd_request_t*)(long)p->block_id; + + ERR_IF (!VALID_POINTER(req)) return FALSE; + + drbd_end_req(req, RQ_DRBD_SENT, 1, sector); + + if (test_bit(SYNC_STARTED,&mdev->flags) && + mdev->conf.wire_protocol == DRBD_PROT_C) + drbd_set_in_sync(mdev,sector,blksize); + } + } + + if(is_syncer_blk(mdev,p->block_id)) { + dec_rs_pending(mdev); + } else { + D_ASSERT(mdev->conf.wire_protocol != DRBD_PROT_A); + dec_ap_pending(mdev); + } + return TRUE; +} + +STATIC int got_NegAck(drbd_dev *mdev, Drbd_Header* h) +{ + Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h; +#if 0 + sector_t sector = be64_to_cpu(p->sector); + int size = be32_to_cpu(p->blksize); +#endif + + /* do nothing here. + * we expect to get a "report param" on the data socket soon, + * and will do the cleanup then and there. + */ + if(is_syncer_blk(mdev,p->block_id)) { + dec_rs_pending(mdev); + } +#if 0 + else { + D_ASSERT(bm_get_bit(mdev->mbds_id,sector,size)); + // tl_clear() must have set this out of sync! + D_ASSERT(mdev->conf.wire_protocol != DRBD_PROT_A); + dec_ap_pending(mdev,HERE); + } +#endif + if (DRBD_ratelimit(5*HZ,5)) + WARN("Got NegAck packet. Peer is in troubles?\n"); + + return TRUE; +} + +STATIC int got_NegDReply(drbd_dev *mdev, Drbd_Header* h) +{ + /* drbd_request_t *req; + * unused now */ + Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h; + + if (is_syncer_blk(mdev,p->block_id)) { + /* no resync data available. don't panic just yet ... */ + printk(KERN_EMERG DEVICE_NAME "%d: " + "Got NegDReply for resync request. " + "WE ARE LOST. We lost our up-to-date disk.\n", + (int)(mdev-drbd_conf)); + return FALSE; + } /* else { */ + +#if 0 + /* hey, we panic anyways. so why bother? 
*/ + req = (drbd_request_t *)(long)p->block_id; + if (VALID_POINTER(req)) { + D_ASSERT(req->w.cb == w_is_app_read); + + spin_lock(&mdev->pr_lock); + list_del(&req->w.list); + spin_unlock(&mdev->pr_lock); + + INVALIDATE_MAGIC(req); + mempool_free(req,drbd_request_mempool); + } +#endif + + drbd_panic("Got NegDReply. WE ARE LOST. We lost our up-to-date disk.\n"); + + // THINK do we have other options, but panic? + // what about bio_endio, in case we don't panic ?? + + return FALSE; +} + +STATIC int got_NegRSDReply(drbd_dev *mdev, Drbd_Header* h) +{ + sector_t sector; + Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h; + + sector = be64_to_cpu(p->sector); + D_ASSERT(p->block_id == ID_SYNCER); + + drbd_rs_complete_io(mdev,sector); + + drbd_panic("Got NegRSDReply. WE ARE LOST. We lost our up-to-date disk.\n"); + + // THINK do we have other options, but panic? + // what about bio_endio, in case we don't panic ?? + + return TRUE; +} + +STATIC int got_BarrierAck(drbd_dev *mdev, Drbd_Header* h) +{ + Drbd_BarrierAck_Packet *p = (Drbd_BarrierAck_Packet*)h; + + smp_rmb(); + if(unlikely(test_bit(PARTNER_DISKLESS,&mdev->flags))) return TRUE; + + tl_release(mdev,p->barrier,be32_to_cpu(p->set_size)); + dec_ap_pending(mdev); + + return TRUE; +} + +struct asender_cmd { + size_t pkt_size; + int (*process)(drbd_dev *mdev, Drbd_Header* h); +}; + +int drbd_asender(struct Drbd_thread *thi) +{ + drbd_dev *mdev = thi->mdev; + Drbd_Header *h = &mdev->meta.rbuf.head; + + int rv,len; + void *buf = h; + int received = 0; + int expect = sizeof(Drbd_Header); + int cmd = -1; + + static struct asender_cmd asender_tbl[] = { + [Ping] ={ sizeof(Drbd_Header), got_Ping }, + [PingAck] ={ sizeof(Drbd_Header), got_PingAck }, + [RecvAck] ={ sizeof(Drbd_BlockAck_Packet), got_BlockAck }, + [WriteAck] ={ sizeof(Drbd_BlockAck_Packet), got_BlockAck }, + [NegAck] ={ sizeof(Drbd_BlockAck_Packet), got_NegAck }, + [NegDReply] ={ sizeof(Drbd_BlockAck_Packet), got_NegDReply }, + [NegRSDReply]={sizeof(Drbd_BlockAck_Packet), got_NegRSDReply}, + [BarrierAck]={ sizeof(Drbd_BarrierAck_Packet),got_BarrierAck }, + }; + + sprintf(current->comm, "drbd%d_asender", (int)(mdev-drbd_conf)); + + current->policy = SCHED_RR; /* Make this a realtime task! */ + current->rt_priority = 2; /* more important than all other tasks */ + + while (get_t_state(thi) == Running) { + if (test_and_clear_bit(SEND_PING, &mdev->flags)) { + ERR_IF(!drbd_send_ping(mdev)) goto err; + // half ack timeout only, + // since sendmsg waited the other half already + mdev->meta.socket->sk->SK_(rcvtimeo) = + mdev->conf.timeout*HZ/20; + } + + /* FIXME this *should* be below drbd_process_ee, + * but that leads to some distributed deadlock :-( + * this needs to be fixed properly, I'd vote for a separate + * msock sender thread, but others will frown upon yet an other + * kernel thread... + * -- lge + */ + set_bit(SIGNAL_ASENDER, &mdev->flags); + + if (!drbd_process_ee(mdev,0)) goto err; + + rv = drbd_recv_short(mdev,buf,expect-received); + clear_bit(SIGNAL_ASENDER, &mdev->flags); + + drbd_flush_signals(current); + + /* Note: + * -EINTR (on meta) we got a signal + * -EAGAIN (on meta) rcvtimeo expired + * -ECONNRESET other side closed the connection + * -ERESTARTSYS (on data) we got a signal + * rv < 0 other than above: unexpected error! 
+ * rv == expected: full header or command + * rv < expected: "woken" by signal during receive + * rv == 0 : "connection shut down by peer" + */ + if (likely(rv > 0)) { + received += rv; + buf += rv; + } else if (rv == 0) { + ERR("meta connection shut down by peer.\n"); + goto err; + } else if (rv == -EAGAIN) { + if( mdev->meta.socket->sk->SK_(rcvtimeo) == + mdev->conf.timeout*HZ/20) { + ERR("PingAck did not arrive in time.\n"); + goto err; + } + set_bit(SEND_PING,&mdev->flags); + continue; + } else if (rv == -EINTR) { + continue; + } else { + ERR("sock_recvmsg returned %d\n", rv); + goto err; + } + + if (received == expect && cmd == -1 ) { + cmd = be16_to_cpu(h->command); + len = be16_to_cpu(h->length); + if (unlikely( h->magic != BE_DRBD_MAGIC )) { + ERR("magic?? m: 0x%lx c: %d l: %d\n", + (long)be32_to_cpu(h->magic), + h->command, h->length); + goto err; + } + expect = asender_tbl[cmd].pkt_size; + ERR_IF(len != expect-sizeof(Drbd_Header)) { + dump_packet(mdev,mdev->meta.socket,1,(void*)h, __FILE__, __LINE__); + DUMPI(expect); + } + } + if(received == expect) { + D_ASSERT(cmd != -1); + dump_packet(mdev,mdev->meta.socket,1,(void*)h, __FILE__, __LINE__); + if(!asender_tbl[cmd].process(mdev,h)) goto err; + + buf = h; + received = 0; + expect = sizeof(Drbd_Header); + cmd = -1; + } + } //while + + if(0) { + err: + clear_bit(SIGNAL_ASENDER, &mdev->flags); + if (mdev->cstate >= Connected) + set_cstate(mdev,NetworkFailure); + drbd_thread_restart_nowait(&mdev->receiver); + } + + INFO("asender terminated\n"); + + return 0; +} --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./drivers/block/drbd/drbd_req.c 2005-08-16 16:32:42.000000000 +0400 @@ -0,0 +1,425 @@ +/* +-*- linux-c -*- + drbd_req.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 1999-2004, Philipp Reisner . + main author. + + Copyright (C) 2002-2004, Lars Ellenberg . + main contributor. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include + +#include +#include +#include "drbd_int.h" + +void drbd_end_req(drbd_request_t *req, int nextstate, int er_flags, + sector_t rsector) +{ + /* This callback will be called in irq context by the IDE drivers, + and in Softirqs/Tasklets/BH context by the SCSI drivers. + This function is called by the receiver in kernel-thread context. 
+ Try to get the locking right :) */ + + struct Drbd_Conf* mdev = drbd_req_get_mdev(req); + unsigned long flags=0; + int uptodate; + + PARANOIA_BUG_ON(!IS_VALID_MDEV(mdev)); + PARANOIA_BUG_ON(drbd_req_get_sector(req) != rsector); + spin_lock_irqsave(&mdev->req_lock,flags); + + if(req->rq_status & nextstate) { + ERR("request state error(%d)\n", req->rq_status); + } + + req->rq_status |= nextstate; + req->rq_status &= er_flags | ~0x0001; + if( (req->rq_status & RQ_DRBD_DONE) == RQ_DRBD_DONE ) goto end_it; + + spin_unlock_irqrestore(&mdev->req_lock,flags); + + return; + +/* We only report uptodate == TRUE if both operations (WRITE && SEND) + reported uptodate == TRUE + */ + + end_it: + spin_unlock_irqrestore(&mdev->req_lock,flags); + + if( req->rq_status & RQ_DRBD_IN_TL ) { + if( ! ( er_flags & ERF_NOTLD ) ) { + /*If this call is from tl_clear() we may not call + tl_dependene, otherwhise we have a homegrown + spinlock deadlock. */ + if(tl_dependence(mdev,req)) + set_bit(ISSUE_BARRIER,&mdev->flags); + } else { + list_del(&req->w.list); // we have the tl_lock... + } + } + + uptodate = req->rq_status & 0x0001; + if( !uptodate && mdev->on_io_error == Detach) { + drbd_set_out_of_sync(mdev,rsector, drbd_req_get_size(req)); + // It should also be as out of sync on + // the other side! See w_io_error() + + drbd_bio_endio(req->master_bio,1); + dec_ap_bio(mdev); + // The assumption is that we wrote it on the peer. + +// FIXME proto A and diskless :) + + req->w.cb = w_io_error; + drbd_queue_work(mdev,&mdev->data.work,&req->w); + + goto out; + + } + + drbd_bio_endio(req->master_bio,uptodate); + dec_ap_bio(mdev); + + INVALIDATE_MAGIC(req); + mempool_free(req,drbd_request_mempool); + + out: + if (test_bit(ISSUE_BARRIER,&mdev->flags)) { + spin_lock_irqsave(&mdev->req_lock,flags); + if(list_empty(&mdev->barrier_work.list)) { + _drbd_queue_work(&mdev->data.work,&mdev->barrier_work); + } + spin_unlock_irqrestore(&mdev->req_lock,flags); + } +} + +int drbd_read_remote(drbd_dev *mdev, drbd_request_t *req) +{ + int rv; + drbd_bio_t *bio = req->master_bio; + + req->w.cb = w_is_app_read; + spin_lock(&mdev->pr_lock); + list_add(&req->w.list,&mdev->app_reads); + spin_unlock(&mdev->pr_lock); + set_bit(UNPLUG_REMOTE,&mdev->flags); +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + rv=drbd_send_drequest(mdev, DataRequest, bio->b_rsector, bio->b_size, + (unsigned long)req); +#else + rv=drbd_send_drequest(mdev, DataRequest, bio->bi_sector, bio->bi_size, + (unsigned long)req); +#endif + return rv; +} + + +/* we may do a local read if: + * - we are consistent (of course), + * - or we are generally inconsistent, + * BUT we are still/already IN SYNC for this area. + * since size may be up to PAGE_SIZE, but BM_BLOCK_SIZE may be smaller + * than PAGE_SIZE, we may need to check several bits. 
+ */ +STATIC int drbd_may_do_local_read(drbd_dev *mdev, sector_t sector, int size) +{ + unsigned long sbnr,ebnr,bnr; + sector_t esector, nr_sectors; + + if (drbd_md_test_flag(mdev,MDF_Consistent)) return 1; + + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size>>9) -1; + + D_ASSERT(sector < nr_sectors); + D_ASSERT(esector < nr_sectors); + + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + + for (bnr = sbnr; bnr <= ebnr; bnr++) { + if (drbd_bm_test_bit(mdev,bnr)) return 0; + } + return 1; +} + +STATIC int +drbd_make_request_common(drbd_dev *mdev, int rw, int size, + sector_t sector, drbd_bio_t *bio) +{ + drbd_request_t *req; + int local, remote; + int target_area_out_of_sync = FALSE; // only relevant for reads + + if (unlikely(drbd_did_panic == DRBD_MAGIC)) { + drbd_bio_IO_error(bio); + return 0; + } + + /* + * If someone tries to mount on Secondary, and this is a 2.4 kernel, + * it would lead to a readonly mounted, but not cache-coherent, + * therefore dangerous, filesystem. + * On 2.6 this is prevented by bd_claiming the device. + * It is not that easy in 2.4. + * + * Because people continue to report they mount readonly, it does not + * do what they expect, and their logs fill with messages and stuff. + * + * Since it just won't work, we just fail IO here. + * [ ... until we implement some shared mode, and our users confirm by + * configuration, that they handle cache coherency themselves ... ] + */ + if (mdev->state != Primary && + ( !disable_bd_claim || rw == WRITE ) ) { + if (DRBD_ratelimit(5*HZ,5)) { + ERR("Not in Primary state, no %s requests allowed\n", + disable_bd_claim ? "WRITE" : "IO"); + } + drbd_bio_IO_error(bio); + return 0; + } + + /* + * Paranoia: we might have been primary, but sync target, or + * even diskless, then lost the connection. + * This should have been handled (panic? suspend?) somehwere + * else. But maybe it was not, so check again here. + * Caution: as long as we do not have a read/write lock on mdev, + * to serialize state changes, this is racy, since we may lose + * the connection *after* we test for the cstate. + */ + if ( ( test_bit(DISKLESS,&mdev->flags) + || !drbd_md_test_flag(mdev,MDF_Consistent) + ) && mdev->cstate < Connected ) + { + ERR("Sorry, I have no access to good data anymore.\n"); +/* + FIXME suspend, loop waiting on cstate wait? panic? +*/ + drbd_bio_IO_error(bio); + return 0; + } + + /* allocate outside of all locks + */ + req = mempool_alloc(drbd_request_mempool, GFP_DRBD); + if (!req) { + /* only pass the error to the upper layers. + * if user cannot handle io errors, thats not our business. + */ + ERR("could not kmalloc() req\n"); + drbd_bio_IO_error(bio); + return 0; + } + SET_MAGIC(req); + req->master_bio = bio; + + // XXX maybe merge both variants into one + if (rw == WRITE) drbd_req_prepare_write(mdev,req); + else drbd_req_prepare_read(mdev,req); + + /* XXX req->w.cb = something; drbd_queue_work() .... + * Not yet. + */ + + // down_read(mdev->device_lock); + + wait_event( mdev->cstate_wait, + (volatile int)(mdev->cstate < WFBitMapS || + mdev->cstate > WFBitMapT) ); + + local = inc_local(mdev); + NOT_IN_26( if (rw == READA) rw=READ ); + if (rw == READ || rw == READA) { + if (local) { + if (!drbd_may_do_local_read(mdev,sector,size)) { + /* whe could kick the syncer to + * sync this extent asap, wait for + * it, then continue locally. + * Or just issue the request remotely. + */ + /* FIXME + * I think we have a RACE here. 
We request + * something from the peer, then later some + * write starts ... and finished *before* + * the answer to the read comes in, because + * the ACK for the WRITE goes over + * meta-socket ... + * Maybe we need to properly lock reads + * against the syncer, too. But if we have + * some user issuing writes on an area that + * he has pending reads on, _he_ is really + * broke anyways, and would get "undefined + * results" on _any_ io stack, even just the + * local io stack. + */ + local = 0; + dec_local(mdev); + } + } + remote = !local && test_bit(PARTNER_CONSISTENT, &mdev->flags); + } else { + remote = 1; + } + + /* If we have a disk, but a READA request is mapped to remote, + * we are Primary, Inconsistent, SyncTarget. + * Just fail that READA request right here. + * + * THINK: maybe fail all READA when not local? + * or make this configurable... + * if network is slow, READA won't do any good. + */ + if (rw == READA && !test_bit(DISKLESS,&mdev->flags) && !local) { + drbd_bio_IO_error(bio); + return 0; + } + + if (rw == WRITE && local) + drbd_al_begin_io(mdev, sector); + + remote = remote && (mdev->cstate >= Connected) + && !test_bit(PARTNER_DISKLESS,&mdev->flags); + + if (!(local || remote)) { + ERR("IO ERROR: neither local nor remote disk\n"); + // FIXME PANIC ?? + drbd_bio_IO_error(bio); + return 0; + } + + /* do this first, so I do not need to call drbd_end_req, + * but can set the rq_status directly. + */ + if (!local) + req->rq_status |= RQ_DRBD_LOCAL; + if (!remote) + req->rq_status |= RQ_DRBD_SENT; + + /* we need to plug ALWAYS since we possibly need to kick lo_dev */ + drbd_plug_device(mdev); + + inc_ap_bio(mdev); + if (remote) { + /* either WRITE and Connected, + * or READ, and no local disk, + * or READ, but not in sync. + */ + inc_ap_pending(mdev); + if (rw == WRITE) { + if (!drbd_send_dblock(mdev,req)) { + if (mdev->cstate >= Connected) + set_cstate(mdev,NetworkFailure); + dec_ap_pending(mdev); + drbd_thread_restart_nowait(&mdev->receiver); + } else if(mdev->conf.wire_protocol == DRBD_PROT_A) { + dec_ap_pending(mdev); + drbd_end_req(req, RQ_DRBD_SENT, 1, sector); + } + } else if (target_area_out_of_sync) { + drbd_read_remote(mdev,req); + } else { + // this node is diskless ... + drbd_read_remote(mdev,req); + } + } + + if (local) { + if (rw == WRITE) { + if (!remote) drbd_set_out_of_sync(mdev,sector,size); + } else { + D_ASSERT(!remote); + } + /* FIXME + * Should we add even local reads to some list, so + * they can be grabbed and freed somewhen? + * + * They already have a reference count (sort of...) + * on mdev via inc_local() + */ + if(rw == WRITE) mdev->writ_cnt += size>>9; + else mdev->read_cnt += size>>9; + + // in 2.4.X, READA are submitted as READ. 
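/*
 * Aside, illustration only: a distilled, standalone sketch of the rule
 * drbd_end_req() enforces for the write just queued above.  A mirrored
 * write completes towards the upper layer only when both the local disk
 * write and the network part have been accounted for; with protocol A the
 * network part counts as done as soon as the block has been handed to TCP,
 * with protocol C only when the peer's WriteAck arrives.  All identifiers
 * below are invented for this sketch.
 */
enum { SK_LOCAL_DONE = 1, SK_NET_DONE = 2 };

struct sk_write {
	unsigned int done;		/* which halves have finished so far */
	int uptodate;			/* both halves must report success   */
};

static void sk_write_half_done(struct sk_write *w, unsigned int half, int ok)
{
	w->done |= half;
	if (!ok)
		w->uptodate = 0;
	if (w->done == (SK_LOCAL_DONE | SK_NET_DONE)) {
		/* now, and only now, complete the original bio with
		 * w->uptodate; protocol A reaches this for SK_NET_DONE right
		 * after the send, protocol C from the WriteAck handler. */
	}
}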
+ drbd_generic_make_request(rw,drbd_req_private_bio(req)); + } + + // up_read(mdev->device_lock); + return 0; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +int drbd_make_request_24(request_queue_t *q, int rw, struct buffer_head *bh) +{ + struct Drbd_Conf* mdev = drbd_conf + MINOR(bh->b_rdev); + if (MINOR(bh->b_rdev) >= minor_count || mdev->cstate < StandAlone) { + buffer_IO_error(bh); + return 0; + } + + return drbd_make_request_common(mdev,rw,bh->b_size,bh->b_rsector,bh); +} +#else +int drbd_make_request_26(request_queue_t *q, struct bio *bio) +{ + unsigned int s_enr,e_enr; + struct Drbd_Conf* mdev = (drbd_dev*) q->queuedata; + if (mdev->cstate < StandAlone) { + drbd_bio_IO_error(bio); + return 0; + } + + /* + * what we "blindly" assume: + */ + D_ASSERT(bio->bi_size > 0); + D_ASSERT( (bio->bi_size & 0x1ff) == 0); + D_ASSERT(bio->bi_size <= PAGE_SIZE); + D_ASSERT(bio->bi_vcnt == 1); + D_ASSERT(bio->bi_idx == 0); + + s_enr = bio->bi_sector >> (AL_EXTENT_SIZE_B-9); + e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> (AL_EXTENT_SIZE_B-9); + D_ASSERT(e_enr >= s_enr); + + if(unlikely(s_enr != e_enr)) { + /* This bio crosses an AL_EXTENT boundary, so we have to + * split it. [So far, only XFS is known to do this...] + */ + struct bio_pair *bp; + bp = bio_split(bio, bio_split_pool, + (e_enr<<(AL_EXTENT_SIZE_B-9)) - bio->bi_sector); + drbd_make_request_26(q,&bp->bio1); + drbd_make_request_26(q,&bp->bio2); + bio_pair_release(bp); + return 0; + } + + return drbd_make_request_common(mdev,bio_rw(bio),bio->bi_size, + bio->bi_sector,bio); +} +#endif --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./drivers/block/drbd/drbd_sizeof_sanity_check.c 2005-10-17 18:32:53.000000000 +0400 @@ -0,0 +1,24 @@ +#include +#include + +#define SZO(type,size) \ + s = sizeof(type); \ + if (s != size) { \ + printk("<3>sizeof(" #type "): %d != %d\n", s, size); \ + err = -1; \ + } + +int sizeof_drbd_structs_sanity_check(void) +{ + int err = 0, s = 0; + SZO(struct disk_config, 24) + SZO(struct net_config, 304) + SZO(struct syncer_config, 24) + SZO(struct ioctl_disk_config, 32) + SZO(struct ioctl_net_config, 312) + SZO(struct ioctl_syncer_config, 32) + SZO(struct ioctl_wait, 16) + SZO(struct ioctl_get_config, 440) + if (err) printk("<3>ioctls won't work, aborting\n"); + return err; +} --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./drivers/block/drbd/drbd_worker.c 2005-09-22 13:31:37.000000000 +0400 @@ -0,0 +1,985 @@ +/* +-*- linux-c -*- + drbd_worker.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 2003-2004, Philipp Reisner . + Copyright (C) 2003-2004, Lars Ellenberg . + authors. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) || defined(HAVE_MM_INLINE_H) +#include // for the page_count macro on RH/Fedora +#endif +#include + +#include +#include "drbd_int.h" + +/* I choose to have all block layer end_io handlers defined here. + + * For all these callbacks, note the follwing: + * The callbacks will be called in irq context by the IDE drivers, + * and in Softirqs/Tasklets/BH context by the SCSI drivers. + * Try to get the locking right :) + * + */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + +/* used for synchronous meta data and bitmap IO + * submitted by FIXME (I'd say worker only, but currently this is not true...) + */ +void drbd_md_io_complete(struct buffer_head *bh, int uptodate) +{ + if (uptodate) + set_bit(BH_Uptodate, &bh->b_state); + + complete((struct completion*)bh->b_private); +} + +/* reads on behalf of the partner, + * "submitted" by the receiver + */ +void enslaved_read_bi_end_io(drbd_bio_t *bh, int uptodate) +{ + unsigned long flags=0; + struct Tl_epoch_entry *e=NULL; + struct Drbd_Conf* mdev; + + mdev=bh->b_private; + PARANOIA_BUG_ON(!IS_VALID_MDEV(mdev)); + + e = container_of(bh,struct Tl_epoch_entry,private_bio); + PARANOIA_BUG_ON(!VALID_POINTER(e)); + D_ASSERT(e->block_id != ID_VACANT); + + spin_lock_irqsave(&mdev->ee_lock,flags); + + mark_buffer_uptodate(bh, uptodate); + clear_bit(BH_Lock, &bh->b_state); + smp_mb__after_clear_bit(); + + list_del(&e->w.list); + if(list_empty(&mdev->read_ee)) wake_up(&mdev->ee_wait); + spin_unlock_irqrestore(&mdev->ee_lock,flags); + + drbd_chk_io_error(mdev,!uptodate); + drbd_queue_work(mdev,&mdev->data.work,&e->w); + dec_local(mdev); +} + +/* writes on behalf of the partner, or resync writes, + * "submitted" by the receiver. 
+ */ +void drbd_dio_end_sec(struct buffer_head *bh, int uptodate) +{ + unsigned long flags=0; + struct Tl_epoch_entry *e=NULL; + struct Drbd_Conf* mdev; + + mdev=bh->b_private; + PARANOIA_BUG_ON(!IS_VALID_MDEV(mdev)); + + e = container_of(bh,struct Tl_epoch_entry,private_bio); + PARANOIA_BUG_ON(!VALID_POINTER(e)); + D_ASSERT(e->block_id != ID_VACANT); + + spin_lock_irqsave(&mdev->ee_lock,flags); + + mark_buffer_uptodate(bh, uptodate); + + clear_bit(BH_Dirty, &bh->b_state); + clear_bit(BH_Lock, &bh->b_state); + smp_mb__after_clear_bit(); + + list_del(&e->w.list); + list_add_tail(&e->w.list,&mdev->done_ee); + + if (waitqueue_active(&mdev->ee_wait) && + (list_empty(&mdev->active_ee) || + list_empty(&mdev->sync_ee))) + wake_up(&mdev->ee_wait); + + spin_unlock_irqrestore(&mdev->ee_lock,flags); + + drbd_chk_io_error(mdev,!uptodate); + wake_asender(mdev); + dec_local(mdev); +} + +/* writes on Primary comming from drbd_make_request + */ +void drbd_dio_end(struct buffer_head *bh, int uptodate) +{ + struct Drbd_Conf* mdev; + drbd_request_t *req; + + mdev = bh->b_private; + PARANOIA_BUG_ON(!IS_VALID_MDEV(mdev)); + + req = container_of(bh,struct drbd_request,private_bio); + PARANOIA_BUG_ON(!VALID_POINTER(req)); + + drbd_chk_io_error(mdev,!uptodate); + drbd_end_req(req, RQ_DRBD_LOCAL, uptodate, drbd_req_get_sector(req)); + drbd_al_complete_io(mdev,drbd_req_get_sector(req)); + dec_local(mdev); +} + +/* reads on Primary comming from drbd_make_request + */ +void drbd_read_bi_end_io(struct buffer_head *bh, int uptodate) +{ + struct Drbd_Conf* mdev; + drbd_request_t *req; + + mdev = bh->b_private; + PARANOIA_BUG_ON(!IS_VALID_MDEV(mdev)); + + req = container_of(bh,struct drbd_request,private_bio); + PARANOIA_BUG_ON(!VALID_POINTER(req)); + + // no special case for READA here, in 2.4.X we submit them as READ. + if (!uptodate) { + // for the panic: + drbd_chk_io_error(mdev,!uptodate); // handle panic and detach. + if(mdev->on_io_error == PassOn) goto pass_on; + // ok, if we survived this, retry: + // FIXME sector ... 
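/*
 * Aside, illustration only: the completion handlers above keep mdev in
 * b_private and recover their per-request structure (Tl_epoch_entry or
 * drbd_request) from the embedded descriptor with container_of(), so no
 * extra back pointer is needed.  Standalone sketch, names invented:
 */
#include <stddef.h>

struct sk_bh { int uptodate; };

struct sk_epoch_entry {
	unsigned long block_id;
	struct sk_bh private_bh;	/* embedded, not a pointer */
};

#define sk_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static struct sk_epoch_entry *sk_entry_from_bh(struct sk_bh *bh)
{
	return sk_container_of(bh, struct sk_epoch_entry, private_bh);
}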
+ if (DRBD_ratelimit(5*HZ,5)) + ERR("local read failed, retrying remotely\n"); + req->w.cb = w_read_retry_remote; + drbd_queue_work(mdev,&mdev->data.work,&req->w); + } else { + pass_on: + req->master_bio->b_end_io(req->master_bio,uptodate); + dec_ap_bio(mdev); + + INVALIDATE_MAGIC(req); + mempool_free(req,drbd_request_mempool); + } + dec_local(mdev); +} + +#else + +/* used for synchronous meta data and bitmap IO + * submitted by drbd_md_sync_page_io() + */ +int drbd_md_io_complete(struct bio *bio, unsigned int bytes_done, int error) +{ + if (bio->bi_size) + return 1; + + complete((struct completion*)bio->bi_private); + return 0; +} + +/* reads on behalf of the partner, + * "submitted" by the receiver + */ +int enslaved_read_bi_end_io(struct bio *bio, unsigned int bytes_done, int error) +{ + unsigned long flags=0; + struct Tl_epoch_entry *e=NULL; + struct Drbd_Conf* mdev; + + mdev=bio->bi_private; + PARANOIA_BUG_ON(!IS_VALID_MDEV(mdev)); + + /* we should be called via bio_endio, so this should never be the case + * but "everyone else does it", and so do we ;) -lge + */ + ERR_IF (bio->bi_size) + return 1; + + e = container_of(bio,struct Tl_epoch_entry,private_bio); + PARANOIA_BUG_ON(!VALID_POINTER(e)); + D_ASSERT(e->block_id != ID_VACANT); + + spin_lock_irqsave(&mdev->ee_lock,flags); + list_del(&e->w.list); + if(list_empty(&mdev->read_ee)) wake_up(&mdev->ee_wait); + spin_unlock_irqrestore(&mdev->ee_lock,flags); + + drbd_chk_io_error(mdev,error); + drbd_queue_work(mdev,&mdev->data.work,&e->w); + dec_local(mdev); + return 0; +} + +/* writes on behalf of the partner, or resync writes, + * "submitted" by the receiver. + */ +int drbd_dio_end_sec(struct bio *bio, unsigned int bytes_done, int error) +{ + unsigned long flags=0; + struct Tl_epoch_entry *e=NULL; + struct Drbd_Conf* mdev; + + mdev=bio->bi_private; + PARANOIA_BUG_ON(!IS_VALID_MDEV(mdev)); + + // see above + ERR_IF (bio->bi_size) + return 1; + + e = container_of(bio,struct Tl_epoch_entry,private_bio); + PARANOIA_BUG_ON(!VALID_POINTER(e)); + D_ASSERT(e->block_id != ID_VACANT); + + spin_lock_irqsave(&mdev->ee_lock,flags); + list_del(&e->w.list); + list_add_tail(&e->w.list,&mdev->done_ee); + + if (waitqueue_active(&mdev->ee_wait) && + (list_empty(&mdev->active_ee) || + list_empty(&mdev->sync_ee))) + wake_up(&mdev->ee_wait); + + spin_unlock_irqrestore(&mdev->ee_lock,flags); + + drbd_chk_io_error(mdev,error); + wake_asender(mdev); + dec_local(mdev); + return 0; +} + +/* writes on Primary comming from drbd_make_request + */ +int drbd_dio_end(struct bio *bio, unsigned int bytes_done, int error) +{ + drbd_request_t *req=bio->bi_private; + struct Drbd_Conf* mdev=req->mdev; + sector_t rsector; + + // see above + ERR_IF (bio->bi_size) + return 1; + + drbd_chk_io_error(mdev,error); + rsector = drbd_req_get_sector(req); + // the bi_sector of the bio gets modified somewhere in drbd_end_req()! + drbd_end_req(req, RQ_DRBD_LOCAL, (error == 0), rsector); + drbd_al_complete_io(mdev,rsector); + dec_local(mdev); + bio_put(bio); + return 0; +} + +/* reads on Primary comming from drbd_make_request + */ +int drbd_read_bi_end_io(struct bio *bio, unsigned int bytes_done, int error) +{ + drbd_request_t *req=bio->bi_private; + struct Drbd_Conf* mdev=req->mdev; + + // see above + ERR_IF (bio->bi_size) + return 1; + + /* READAs may fail. + * upper layers need to be able to handle that themselves */ + if (bio_rw(bio) == READA) goto pass_on; + if (error) { + drbd_chk_io_error(mdev,error); // handle panic and detach. 
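/*
 * Aside, illustration only: all 2.6 handlers above follow the bi_end_io
 * convention that the callback may fire for partial completions; while
 * bio->bi_size is still non-zero they return 1 and wait for the final
 * invocation before touching the request.  Standalone sketch with an
 * invented bio type:
 */
struct sk_bio {
	unsigned int bi_size;		/* bytes still outstanding */
	void *bi_private;		/* owning request          */
};

static int sk_end_io(struct sk_bio *bio, unsigned int bytes_done, int error)
{
	if (bio->bi_size)		/* not the last fragment yet */
		return 1;
	/* last fragment: complete the request behind bio->bi_private,
	 * taking 'error' into account */
	return 0;
}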
+ if(mdev->on_io_error == PassOn) goto pass_on; + // ok, if we survived this, retry: + // FIXME sector ... + if (DRBD_ratelimit(5*HZ,5)) + ERR("local read failed, retrying remotely\n"); + req->w.cb = w_read_retry_remote; + drbd_queue_work(mdev,&mdev->data.work,&req->w); + } else { + pass_on: + bio_endio(req->master_bio,req->master_bio->bi_size,error); + dec_ap_bio(mdev); + + INVALIDATE_MAGIC(req); + mempool_free(req,drbd_request_mempool); + } + + bio_put(bio); + dec_local(mdev); + return 0; +} +#endif + +int w_io_error(drbd_dev* mdev, struct drbd_work* w,int cancel) +{ + drbd_request_t *req = (drbd_request_t*)w; + int ok; + + /* FIXME send a "set_out_of_sync" packet to the peer + * in the PassOn case... + * in the Detach (or Panic) case, we (try to) send + * a "we are diskless" param packet anyways, and the peer + * will then set the FullSync bit in the meta data ... + */ + D_ASSERT(mdev->on_io_error != PassOn); + + INVALIDATE_MAGIC(req); + mempool_free(req,drbd_request_mempool); + + if(unlikely(cancel)) return 1; + + ok = drbd_io_error(mdev); + if(unlikely(!ok)) ERR("Sending in w_io_error() failed\n"); + return ok; +} + +int w_read_retry_remote(drbd_dev* mdev, struct drbd_work* w,int cancel) +{ + drbd_request_t *req = (drbd_request_t*)w; + int ok; + + smp_rmb(); + if ( cancel || + mdev->cstate < Connected || + !test_bit(PARTNER_CONSISTENT,&mdev->flags) ) { + drbd_panic("WE ARE LOST. Local IO failure, no peer.\n"); + + // does not make much sense, but anyways... + drbd_bio_endio(req->master_bio,0); + dec_ap_bio(mdev); + mempool_free(req,drbd_request_mempool); + return 1; + } + + // FIXME: what if partner was SyncTarget, and is out of sync for + // this area ?? ... should be handled in the receiver. + + ok = drbd_io_error(mdev); + if(unlikely(!ok)) ERR("Sending in w_read_retry_remote() failed\n"); + + inc_ap_pending(mdev); + ok = drbd_read_remote(mdev,req); + if(unlikely(!ok)) { + ERR("drbd_read_remote() failed\n"); + /* dec_ap_pending and bio_io_error are done in + * drbd_fail_pending_reads + */ + } + return ok; +} + +int w_resync_inactive(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + ERR_IF(cancel) return 1; + ERR("resync inactive, but callback triggered??\n"); + return 0; +} + +/* FIXME + * not used any longer, they now use e_end_resync_block. + * maybe remove again? + */ +int w_is_resync_read(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + ERR("%s: Typecheck only, should never be called!\n", __FUNCTION__ ); + return 0; +} + +/* in case we need it. 
currently unused, + * since should be assigned to "w_read_retry_remote" + */ +int w_is_app_read(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + ERR("%s: Typecheck only, should never be called!\n", __FUNCTION__ ); + return 0; +} + +void resync_timer_fn(unsigned long data) +{ + unsigned long flags; + drbd_dev* mdev = (drbd_dev*) data; + + spin_lock_irqsave(&mdev->req_lock,flags); + + if(likely(!test_and_clear_bit(STOP_SYNC_TIMER,&mdev->flags))) { + mdev->resync_work.cb = w_make_resync_request; + } else { + mdev->resync_work.cb = w_resume_next_sg; + } + + if(list_empty(&mdev->resync_work.list)) { + _drbd_queue_work(&mdev->data.work,&mdev->resync_work); + } else INFO("Avoided requeue of resync_work\n"); + + spin_unlock_irqrestore(&mdev->req_lock,flags); +} + +#define SLEEP_TIME (HZ/10) + +int w_make_resync_request(drbd_dev* mdev, struct drbd_work* w,int cancel) +{ + unsigned long bit; + sector_t sector; + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); + int number,i,size; + + PARANOIA_BUG_ON(w != &mdev->resync_work); + + if(unlikely(cancel)) return 1; + + if(unlikely(mdev->cstate < Connected)) { + ERR("Confused in w_make_resync_request()! cstate < Connected"); + return 0; + } + + if (mdev->cstate != SyncTarget) { + ERR("%s in w_make_resync_request\n", cstate_to_name(mdev->cstate)); + } + + number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); + + if (atomic_read(&mdev->rs_pending_cnt)>number) { + goto requeue; + } + number -= atomic_read(&mdev->rs_pending_cnt); + + for(i=0;iresync_work.cb = w_resync_inactive; + return 1; + } + + sector = BM_BIT_TO_SECT(bit); + + if(!drbd_rs_begin_io(mdev,sector)) { + // we have been interrupted, probably connection lost! + D_ASSERT(signal_pending(current)); + return 0; + } + + if(unlikely( drbd_bm_test_bit(mdev,bit) == 0 )) { + //INFO("Block got synced while in drbd_rs_begin_io()\n"); + drbd_rs_complete_io(mdev,sector); + goto next_sector; + } + + if (sector + (size>>9) > capacity) size = (capacity-sector)<<9; + inc_rs_pending(mdev); + if(!drbd_send_drequest(mdev,RSDataRequest, + sector,size,ID_SYNCER)) { + ERR("drbd_send_drequest() failed, aborting..."); + dec_rs_pending(mdev); + return 0; // FAILED. worker will abort! + } + } + + if(drbd_bm_rs_done(mdev)) { + /* last syncer _request_ was sent, + * but the RSDataReply not yet received. sync will end (and + * next sync group will resume), as soon as we receive the last + * resync data block, and the last bit is cleared. + * until then resync "work" is "inactive" ... + */ + mdev->resync_work.cb = w_resync_inactive; + return 1; + } + + requeue: + mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); + return 1; +} + +int drbd_resync_finished(drbd_dev* mdev) +{ + unsigned long db,dt,dbdt; + + dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; + if (dt <= 0) dt=1; + db = mdev->rs_total; + dbdt = Bit2KB(db/dt); + mdev->rs_paused /= HZ; + INFO("Resync done (total %lu sec; paused %lu sec; %lu K/sec)\n", + dt + mdev->rs_paused, mdev->rs_paused, dbdt); + + if (mdev->cstate == SyncTarget || mdev->cstate == PausedSyncT) { + drbd_md_set_flag(mdev,MDF_Consistent); + ERR_IF(drbd_md_test_flag(mdev,MDF_FullSync)) + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_write(mdev); + } else if (mdev->cstate == SyncSource || mdev->cstate == PausedSyncS) { + set_bit(PARTNER_CONSISTENT, &mdev->flags); + } else { + ERR("unexpected cstate (%s) in drbd_resync_finished\n", + cstate_to_name(mdev->cstate)); + } + + // assert that all bit-map parts are cleared. 
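/*
 * Aside, illustration only: the pacing arithmetic used by
 * w_make_resync_request() above.  The resync timer fires every SLEEP_TIME
 * (HZ/10) jiffies and issues at most 'number' requests of BM_BLOCK_SIZE
 * bytes per tick, so the configured rate (KB/s) is honoured.  Standalone
 * sketch; HZ=1000 and a 4 KiB BM_BLOCK_SIZE are assumed for the example.
 */
#define SK_HZ		1000
#define SK_SLEEP_TIME	(SK_HZ/10)	/* one tick = 100 ms            */
#define SK_BM_BLOCK	4096		/* bytes covered per bitmap bit */

static int sk_requests_per_tick(int rate_kb_per_sec)
{
	/* requests/tick = rate[KB/s] * tick[s] / block[KB] */
	return SK_SLEEP_TIME * rate_kb_per_sec / ((SK_BM_BLOCK / 1024) * SK_HZ);
}
/* e.g. a 10000 KB/s rate yields 100*10000/(4*1000) = 250 requests per tick */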
+ D_ASSERT(list_empty(&mdev->resync->lru)); + D_ASSERT(drbd_bm_total_weight(mdev) == 0); + mdev->rs_total = 0; + mdev->rs_paused = 0; + + set_cstate(mdev,Connected); + + return 1; +} + +int w_e_end_data_req(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w; + int ok; + + if(unlikely(cancel)) { + spin_lock_irq(&mdev->ee_lock); + drbd_put_ee(mdev,e); + spin_unlock_irq(&mdev->ee_lock); + dec_unacked(mdev); + return 1; + } + + if(likely(drbd_bio_uptodate(&e->private_bio))) { + ok=drbd_send_block(mdev, DataReply, e); + } else { + ok=drbd_send_ack(mdev,NegDReply,e); + if (DRBD_ratelimit(5*HZ,5)) + ERR("Sending NegDReply. I guess it gets messy.\n"); + drbd_io_error(mdev); + } + + dec_unacked(mdev); + + spin_lock_irq(&mdev->ee_lock); + if( page_count(drbd_bio_get_page(&e->private_bio)) > 1 ) { + /* This might happen if sendpage() has not finished */ + list_add_tail(&e->w.list,&mdev->net_ee); + } else { + drbd_put_ee(mdev,e); + } + spin_unlock_irq(&mdev->ee_lock); + + if(unlikely(!ok)) ERR("drbd_send_block() failed\n"); + return ok; +} + +int w_e_end_rsdata_req(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w; + int ok; + + if(unlikely(cancel)) { + spin_lock_irq(&mdev->ee_lock); + drbd_put_ee(mdev,e); + spin_unlock_irq(&mdev->ee_lock); + dec_unacked(mdev); + return 1; + } + + drbd_rs_complete_io(mdev,drbd_ee_get_sector(e)); + + if(likely(drbd_bio_uptodate(&e->private_bio))) { + if (likely( !test_bit(PARTNER_DISKLESS,&mdev->flags) )) { + inc_rs_pending(mdev); + ok=drbd_send_block(mdev, RSDataReply, e); + } else { + if (DRBD_ratelimit(5*HZ,5)) + ERR("Not sending RSDataReply, partner DISKLESS!\n"); + ok=1; + } + } else { + ok=drbd_send_ack(mdev,NegRSDReply,e); + if (DRBD_ratelimit(5*HZ,5)) + ERR("Sending NegDReply. 
I guess it gets messy.\n"); + drbd_io_error(mdev); + } + + dec_unacked(mdev); + + spin_lock_irq(&mdev->ee_lock); + if( page_count(drbd_bio_get_page(&e->private_bio)) > 1 ) { + /* This might happen if sendpage() has not finished */ + list_add_tail(&e->w.list,&mdev->net_ee); + } else { + drbd_put_ee(mdev,e); + } + spin_unlock_irq(&mdev->ee_lock); + + if(unlikely(!ok)) ERR("drbd_send_block() failed\n"); + return ok; +} + +int w_try_send_barrier(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + int ok=1; + + if(unlikely(cancel)) return ok; + + down(&mdev->data.mutex); + if(test_and_clear_bit(ISSUE_BARRIER,&mdev->flags)) { + ok = _drbd_send_barrier(mdev); + } + up(&mdev->data.mutex); + + return ok; +} + +int w_send_write_hint(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + if (cancel) return 1; + NOT_IN_26(clear_bit(UNPLUG_QUEUED,&mdev->flags)); + return drbd_send_short_cmd(mdev,UnplugRemote); +} + +STATIC void drbd_global_lock(void) +{ + int i; + + local_irq_disable(); + for (i=0; i < minor_count; i++) { + spin_lock(&drbd_conf[i].req_lock); + } +} + +STATIC void drbd_global_unlock(void) +{ + int i; + + for (i=0; i < minor_count; i++) { + spin_unlock(&drbd_conf[i].req_lock); + } + local_irq_enable(); +} + +STATIC void _drbd_rs_resume(drbd_dev *mdev) +{ + Drbd_CState ns; + + ns = mdev->cstate - (PausedSyncS - SyncSource); + D_ASSERT(ns == SyncSource || ns == SyncTarget); + + INFO("Syncer continues.\n"); + mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time; + _set_cstate(mdev,ns); + + if(mdev->cstate == SyncTarget) { + ERR_IF(test_bit(STOP_SYNC_TIMER,&mdev->flags)) { + unsigned long rs_left = drbd_bm_total_weight(mdev); + clear_bit(STOP_SYNC_TIMER,&mdev->flags); + if (rs_left == 0) { + INFO("rs_left==0 in _drbd_rs_resume\n"); + } else { + ERR("STOP_SYNC_TIMER was set in " + "_drbd_rs_resume, but rs_left still %lu\n", + rs_left); + } + } + mod_timer(&mdev->resync_timer,jiffies); + } +} + + +STATIC void _drbd_rs_pause(drbd_dev *mdev) +{ + Drbd_CState ns; + + D_ASSERT(mdev->cstate == SyncSource || mdev->cstate == SyncTarget); + ns = mdev->cstate + (PausedSyncS - SyncSource); + + if(mdev->cstate == SyncTarget) set_bit(STOP_SYNC_TIMER,&mdev->flags); + + mdev->rs_mark_time = jiffies; + // mdev->rs_mark_left = drbd_bm_total_weight(mdev); // I don't care... 
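/*
 * Aside, illustration only: the cstate arithmetic used by _drbd_rs_pause()
 * and _drbd_rs_resume().  Because SyncSource/SyncTarget and
 * PausedSyncS/PausedSyncT are declared at the same distance in the cstate
 * enum, pausing and resuming a syncer is a fixed offset.  The enum values
 * below are invented; only their relative spacing matters.
 */
enum sk_cstate {
	SK_SyncSource = 10,
	SK_SyncTarget,
	SK_PausedSyncS,
	SK_PausedSyncT,
};

static enum sk_cstate sk_pause_cstate(enum sk_cstate cs)
{
	return cs + (SK_PausedSyncS - SK_SyncSource);	/* Sync* -> PausedSync* */
}

static enum sk_cstate sk_resume_cstate(enum sk_cstate cs)
{
	return cs - (SK_PausedSyncS - SK_SyncSource);	/* PausedSync* -> Sync* */
}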
+ _set_cstate(mdev,ns); + INFO("Syncer waits for sync group.\n"); +} + +STATIC int _drbd_pause_higher_sg(drbd_dev *mdev) +{ + drbd_dev *odev; + int i,rv=0; + + for (i=0; i < minor_count; i++) { + odev = drbd_conf + i; + if ( odev->sync_conf.group > mdev->sync_conf.group + && ( odev->cstate == SyncSource || + odev->cstate == SyncTarget ) ) { + _drbd_rs_pause(odev); + rv = 1; + } + } + + return rv; +} + +STATIC int _drbd_lower_sg_running(drbd_dev *mdev) +{ + drbd_dev *odev; + int i,rv=0; + + for (i=0; i < minor_count; i++) { + odev = drbd_conf + i; + if ( odev->sync_conf.group < mdev->sync_conf.group + && ( odev->cstate == SyncSource || + odev->cstate == SyncTarget ) ) { + rv = 1; + } + } + + return rv; +} + +STATIC int _drbd_resume_lower_sg(drbd_dev *mdev) +{ + drbd_dev *odev; + int i,rv=0; + + for (i=0; i < minor_count; i++) { + odev = drbd_conf + i; + if ( odev->sync_conf.group < mdev->sync_conf.group + && ( odev->cstate == PausedSyncS || + odev->cstate == PausedSyncT ) ) { + _drbd_rs_resume(odev); + rv = 1; + } + } + + return rv; +} + +int w_resume_next_sg(drbd_dev* mdev, struct drbd_work* w, int unused) +{ + drbd_dev *odev; + int i,ng=10000; + + PARANOIA_BUG_ON(w != &mdev->resync_work); + + drbd_global_lock(); + + for (i=0; i < minor_count; i++) { + odev = drbd_conf + i; + if ( odev->sync_conf.group <= mdev->sync_conf.group + && ( odev->cstate == SyncSource || + odev->cstate == SyncTarget ) ) { + goto out; // Sync on an other device in this group + // or a lower group still runs. + } + } + + for (i=0; i < minor_count; i++) { // find next sync group + odev = drbd_conf + i; + if ( odev->sync_conf.group > mdev->sync_conf.group + && odev->sync_conf.group < ng && + (odev->cstate==PausedSyncS || odev->cstate==PausedSyncT)){ + ng = odev->sync_conf.group; + } + } + + for (i=0; i < minor_count; i++) { // resume all devices in next group + odev = drbd_conf + i; + if ( odev->sync_conf.group == ng && + (odev->cstate==PausedSyncS || odev->cstate==PausedSyncT)){ + _drbd_rs_resume(odev); + } + } + + out: + drbd_global_unlock(); + w->cb = w_resync_inactive; + + return 1; +} + +void drbd_alter_sg(drbd_dev *mdev, int ng) +{ + int c = 0, p = 0; + int d = (ng - mdev->sync_conf.group); + + drbd_global_lock(); + mdev->sync_conf.group = ng; + + if( ( mdev->cstate == PausedSyncS || + mdev->cstate == PausedSyncT ) && ( d < 0 ) ) { + if(_drbd_pause_higher_sg(mdev)) c=1; + else if(!_drbd_lower_sg_running(mdev)) c=1; + if(c) _drbd_rs_resume(mdev); + } + + if( ( mdev->cstate == SyncSource || + mdev->cstate == SyncTarget ) && ( d > 0 ) ) { + if(_drbd_resume_lower_sg(mdev)) p=1; + else if(_drbd_lower_sg_running(mdev)) p=1; + if(p) _drbd_rs_pause(mdev); + } + drbd_global_unlock(); +} + +void drbd_start_resync(drbd_dev *mdev, Drbd_CState side) +{ + if(side == SyncTarget) { + drbd_md_clear_flag(mdev,MDF_Consistent); + drbd_bm_reset_find(mdev); + } else if (side == SyncSource) { + clear_bit(PARTNER_CONSISTENT, &mdev->flags); + /* If we are SyncSource we must be consistent. + * FIXME this should be an assertion only, + * otherwise it masks a logic bug somewhere else... + */ + ERR_IF (!drbd_md_test_flag(mdev,MDF_Consistent)) { + // FIXME this is actually a BUG()! + drbd_md_set_flag(mdev,MDF_Consistent); + } + } else { + ERR("Usage error in drbd_start_resync! 
(side == %s)\n", + cstate_to_name(side)); + return; + } + drbd_md_write(mdev); + + set_cstate(mdev,side); + mdev->rs_total = + mdev->rs_mark_left = drbd_bm_total_weight(mdev); + mdev->rs_paused = 0; + mdev->rs_start = + mdev->rs_mark_time = jiffies; + + INFO("Resync started as %s (need to sync %lu KB [%lu bits set]).\n", + cstate_to_name(side), + (unsigned long) mdev->rs_total << (BM_BLOCK_SIZE_B-10), + (unsigned long) mdev->rs_total); + + // FIXME: this was a PARANOIA_BUG_ON, but it triggered! ?? + if (mdev->resync_work.cb != w_resync_inactive) { + if (mdev->resync_work.cb == w_make_resync_request) + ERR("resync_work.cb == w_make_resync_request, should be w_resync_inactive\n"); + else if (mdev->resync_work.cb == w_resume_next_sg) + ERR("resync_work.cb == w_resume_next_sg, should be w_resync_inactive\n"); + else + ERR("resync_work.cb == %p ???, should be w_resync_inactive\n", + mdev->resync_work.cb); + return; + } + + if ( mdev->rs_total == 0 ) { + drbd_resync_finished(mdev); + return; + } + + drbd_global_lock(); + if (mdev->cstate == SyncTarget || mdev->cstate == SyncSource) { + _drbd_pause_higher_sg(mdev); + if(_drbd_lower_sg_running(mdev)) { + _drbd_rs_pause(mdev); + } + } /* else: + * thread of other mdev already paused us, + * or something very strange happend to our cstate! + * I really hate it that we can't have a consistent view of cstate. + */ + drbd_global_unlock(); + + if (mdev->cstate == SyncTarget) { + D_ASSERT(!test_bit(STOP_SYNC_TIMER,&mdev->flags)); + mod_timer(&mdev->resync_timer,jiffies); + } else if (mdev->cstate == PausedSyncT) { + D_ASSERT(test_bit(STOP_SYNC_TIMER,&mdev->flags)); + clear_bit(STOP_SYNC_TIMER,&mdev->flags); + } +} + +int drbd_worker(struct Drbd_thread *thi) +{ + drbd_dev *mdev = thi->mdev; + struct drbd_work *w = 0; + LIST_HEAD(work_list); + int intr,i; + + sprintf(current->comm, "drbd%d_worker", (int)(mdev-drbd_conf)); + + for (;;) { + intr = down_interruptible(&mdev->data.work.s); + + if (unlikely(drbd_did_panic == DRBD_MAGIC)) { + drbd_suicide(); + } + + if (intr) { + D_ASSERT(intr == -EINTR); + drbd_flush_signals(current); + ERR_IF (get_t_state(thi) == Running) + continue; + break; + } + + if (get_t_state(thi) != Running) break; + /* With this break, we have done an down() but not consumed + the entry from the list. The cleanup code takes care of + this... */ + + w = 0; + spin_lock_irq(&mdev->req_lock); + D_ASSERT(!list_empty(&mdev->data.work.q)); + w = list_entry(mdev->data.work.q.next,struct drbd_work,list); + list_del_init(&w->list); + spin_unlock_irq(&mdev->req_lock); + + if(!w->cb(mdev,w, mdev->cstate < Connected )) { + //WARN("worker: a callback failed! 
\n"); + if (mdev->cstate >= Connected) + set_cstate(mdev,NetworkFailure); + drbd_thread_restart_nowait(&mdev->receiver); + } + } + + drbd_wait_ee(mdev,&mdev->read_ee); + + i = 0; + spin_lock_irq(&mdev->req_lock); + again: + list_splice_init(&mdev->data.work.q,&work_list); + spin_unlock_irq(&mdev->req_lock); + + while(!list_empty(&work_list)) { + w = list_entry(work_list.next, struct drbd_work,list); + list_del_init(&w->list); + w->cb(mdev,w,1); + i++; + } + + spin_lock_irq(&mdev->req_lock); + ERR_IF(!list_empty(&mdev->data.work.q)) + goto again; + sema_init(&mdev->data.work.s,0); + spin_unlock_irq(&mdev->req_lock); + + INFO("worker terminated\n"); + + return 0; +} --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./drivers/block/drbd/hlist.h 2004-09-21 11:28:39.000000000 +0400 @@ -0,0 +1,129 @@ +#ifndef HLIST_HEAD_INIT +#ifndef HLIST_H +#define HLIST_H + +#ifdef REDHAT_HLIST_BACKPORT +#undef hlist_node +#undef hlist_head +#undef HLIST_HEAD +#undef INIT_HLIST_HEAD +#undef hlist_empty +#undef hlist_del_init +#undef hlist_entry +#undef hlist_add_head +#undef hlist_for_each +#undef hlist_for_each_safe +#endif + +// from linux-2.6.x linux/list.h +// I copied only the part which actually is used in lru_cache.h + +// ok, this is from linux/kernel.h +/** + * container_of - cast a member of a structure out to the containing structure + * + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + * + */ +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +/* + * Double linked lists with a single pointer list head. + * Mostly useful for hash tables where the two pointer list head is + * too wasteful. + * You lose the ability to access the tail in O(1). 
+ */ + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +struct hlist_head { + struct hlist_node *first; +}; + + +#define HLIST_HEAD_INIT { .first = NULL } +#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } +#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) +#define INIT_HLIST_NODE(ptr) ((ptr)->next = NULL, (ptr)->pprev = NULL) + +static __inline__ int hlist_unhashed(const struct hlist_node *h) +{ + return !h->pprev; +} + +static __inline__ int hlist_empty(const struct hlist_head *h) +{ + return !h->first; +} + +static __inline__ void __hlist_del(struct hlist_node *n) +{ + struct hlist_node *next = n->next; + struct hlist_node **pprev = n->pprev; + *pprev = next; + if (next) + next->pprev = pprev; +} + +#ifndef LIST_POISON1 +#define LIST_POISON1 ((void *) 0x00100100) +#define LIST_POISON2 ((void *) 0x00200200) +#endif + +static __inline__ void hlist_del(struct hlist_node *n) +{ + __hlist_del(n); + n->next = LIST_POISON1; + n->pprev = LIST_POISON2; +} + +static __inline__ void hlist_del_init(struct hlist_node *n) +{ + if (n->pprev) { + __hlist_del(n); + INIT_HLIST_NODE(n); + } +} + +static __inline__ void hlist_add_head(struct hlist_node *n, struct hlist_head *h) +{ + struct hlist_node *first = h->first; + n->next = first; + if (first) + first->pprev = &n->next; + h->first = n; + n->pprev = &h->first; +} + +#define hlist_entry(ptr, type, member) container_of(ptr,type,member) + +/* Cannot easily do prefetch unfortunately */ +#define hlist_for_each(pos, head) \ + for (pos = (head)->first; pos && ({ prefetch(pos->next); 1; }); \ + pos = pos->next) + +#define hlist_for_each_safe(pos, n, head) \ + for (pos = (head)->first; n = pos ? pos->next : 0, pos; \ + pos = n) + +/** + * hlist_for_each_entry - iterate over list of given type + * @tpos: the type * to use as a loop counter. + * @pos: the &struct hlist_node to use as a loop counter. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry(tpos, pos, head, member) \ + for (pos = (head)->first; \ + pos && ({ prefetch(pos->next); 1;}) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +#endif +#endif --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./drivers/block/drbd/lru_cache.c 2005-04-05 16:08:31.000000000 +0400 @@ -0,0 +1,289 @@ +/* +-*- linux-c -*- + lru_cache.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 2003-2004, Philipp Reisner . + Copyright (C) 2003-2004, Lars Ellenberg . + authors. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ +#include // for likely() +#include +#include +#include // for memset +#include "lru_cache.h" + +#define STATIC static + +// this is developers aid only! 
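/*
 * Aside, illustration only: how the minimal hlist above is meant to be
 * used for the collision chains of a small hash table -- which is exactly
 * what the lru_cache below does with its slot[] array.  The item type,
 * table size and trivial hash are invented for this sketch.
 */
struct sk_item {
	unsigned int nr;
	struct hlist_node link;
};

#define SK_SLOTS 8
static struct hlist_head sk_table[SK_SLOTS];

static void sk_insert(struct sk_item *it)
{
	hlist_add_head(&it->link, &sk_table[it->nr % SK_SLOTS]);
}

static struct sk_item *sk_lookup(unsigned int nr)
{
	struct sk_item *it;
	struct hlist_node *pos;

	hlist_for_each_entry(it, pos, &sk_table[nr % SK_SLOTS], link)
		if (it->nr == nr)
			return it;
	return NULL;
}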
+#define PARANOIA_ENTRY() BUG_ON(test_and_set_bit(__LC_PARANOIA,&lc->flags)) +#define PARANOIA_LEAVE() do { clear_bit(__LC_PARANOIA,&lc->flags); smp_mb__after_clear_bit(); } while (0) +#define RETURN(x...) do { PARANOIA_LEAVE(); return x ; } while (0) + +/** + * lc_alloc: allocates memory for @e_count objects of @e_size bytes plus the + * struct lru_cache, and the hash table slots. + * returns pointer to a newly initialized lru_cache object with said parameters. + */ +struct lru_cache* lc_alloc(unsigned int e_count, size_t e_size, + void *private_p) +{ + unsigned long bytes; + struct lru_cache *lc; + struct lc_element *e; + int i; + + BUG_ON(!e_count); + e_size = max(sizeof(struct lc_element),e_size); + bytes = e_size+sizeof(struct hlist_head); + bytes *= e_count; + bytes += sizeof(struct lru_cache); + lc = vmalloc(bytes); + memset(lc, 0, bytes); + if (lc) { + INIT_LIST_HEAD(&lc->in_use); + INIT_LIST_HEAD(&lc->lru); + INIT_LIST_HEAD(&lc->free); + lc->element_size = e_size; + lc->nr_elements = e_count; + lc->new_number = -1; + lc->lc_private = private_p; + for(i=0;ilc_number = LC_FREE; + list_add(&e->list,&lc->free); + // memset(,0,) did the rest of init for us + } + } + return lc; +} + +/** + * lc_free: Frees memory allocated by lc_alloc. + * @lc: The lru_cache object + */ +void lc_free(struct lru_cache* lc) +{ + vfree(lc); +} + +static unsigned int lc_hash_fn(struct lru_cache* lc, unsigned int enr) +{ + return enr % lc->nr_elements; +} + + +/** + * lc_find: Returns the pointer to an element, if the element is present + * in the hash table. In case it is not this function returns NULL. + * @lc: The lru_cache object + * @enr: element number + */ +struct lc_element* lc_find(struct lru_cache* lc, unsigned int enr) +{ + struct hlist_node *n; + struct lc_element *e; + + BUG_ON(!lc); + BUG_ON(!lc->nr_elements); + hlist_for_each_entry(e, n, lc->slot + lc_hash_fn(lc, enr), colision) { + if (e->lc_number == enr) return e; + } + return NULL; +} + +STATIC struct lc_element * lc_evict(struct lru_cache* lc) +{ + struct list_head *n; + struct lc_element *e; + + if (list_empty(&lc->lru)) return 0; + + n=lc->lru.prev; + e=list_entry(n, struct lc_element,list); + + list_del(&e->list); + hlist_del(&e->colision); + return e; +} + +/** + * lc_del: Removes an element from the cache (and therefore adds the + * element's storage to the free list) + * + * @lc: The lru_cache object + * @e: The element to remove + */ +void lc_del(struct lru_cache* lc, struct lc_element *e) +{ + // FIXME what to do with refcnt != 0 ? + PARANOIA_ENTRY(); + BUG_ON(e->refcnt); + list_del(&e->list); + hlist_del(&e->colision); + e->lc_number = LC_FREE; + e->refcnt = 0; + list_add(&e->list,&lc->free); + RETURN(); +} + +STATIC struct lc_element* lc_get_unused_element(struct lru_cache* lc) +{ + struct list_head *n; + + if (list_empty(&lc->free)) return lc_evict(lc); + + n=lc->free.next; + list_del(n); + return list_entry(n, struct lc_element,list); +} + +STATIC int lc_unused_element_available(struct lru_cache* lc) +{ + if (!list_empty(&lc->free)) return 1; // something on the free list + if (!list_empty(&lc->lru)) return 1; // something to evict + + return 0; +} + + +/** + * lc_get: Finds an element in the cache, increases its usage count, + * "touches" and returns it. + * In case the requested number is not present, it needs to be added to the + * cache. Therefore it is possible that an other element becomes eviced from + * the cache. In either case, the user is notified so he is able to e.g. 
keep + * a persistent log of the cache changes, and therefore the objects in use. + * + * Return values: + * NULL if the requested element number was not in the cache, and no unused + * element could be recycled + * pointer to the element with the REQUESTED element number + * In this case, it can be used right away + * + * pointer to an UNUSED element with some different element number. + * In this case, the cache is marked dirty, and the returned element + * pointer is removed from the lru list and hash collision chains. + * The user now should do whatever houskeeping is necessary. Then he + * needs to call lc_element_changed(lc,element_pointer), to finish the + * change. + * + * NOTE: The user needs to check the lc_number on EACH use, so he recognizes + * any cache set change. + * + * @lc: The lru_cache object + * @enr: element number + */ +struct lc_element* lc_get(struct lru_cache* lc, unsigned int enr) +{ + struct lc_element *e; + + BUG_ON(!lc); + BUG_ON(!lc->nr_elements); + + PARANOIA_ENTRY(); + if ( lc->flags & LC_STARVING ) RETURN(NULL); + + e = lc_find(lc, enr); + if (e) { + ++e->refcnt; + list_move(&e->list,&lc->in_use); // Not evictable... + RETURN(e); + } + + /* In case there is nothing available and we can not kick out + * the LRU element, we have to wait ... + */ + if(!lc_unused_element_available(lc)) { + __set_bit(__LC_STARVING,&lc->flags); + RETURN(NULL); + } + + /* it was not present in the cache, find an unused element, + * which then is replaced. + * we need to update the cache; serialize on lc->flags & LC_DIRTY + */ + if (test_and_set_bit(__LC_DIRTY,&lc->flags)) RETURN(NULL); + + e = lc_get_unused_element(lc); + BUG_ON(!e); + + clear_bit(__LC_STARVING,&lc->flags); + BUG_ON(++e->refcnt != 1); + + lc->changing_element = e; + lc->new_number = enr; + + RETURN(e); +} + +void lc_changed(struct lru_cache* lc, struct lc_element* e) +{ + PARANOIA_ENTRY(); + BUG_ON(e != lc->changing_element); + e->lc_number = lc->new_number; + list_add(&e->list,&lc->in_use); + hlist_add_head( &e->colision, lc->slot + lc_hash_fn(lc, lc->new_number) ); + lc->changing_element = NULL; + lc->new_number = -1; + clear_bit(__LC_DIRTY,&lc->flags); + smp_mb__after_clear_bit(); + PARANOIA_LEAVE(); +} + + +unsigned int lc_put(struct lru_cache* lc, struct lc_element* e) +{ + BUG_ON(!lc); + BUG_ON(!lc->nr_elements); + BUG_ON(!e); + + PARANOIA_ENTRY(); + BUG_ON(e->refcnt == 0); + if ( --e->refcnt == 0) { + list_move(&e->list,&lc->lru); // move it to the front of LRU. + clear_bit(__LC_STARVING,&lc->flags); + smp_mb__after_clear_bit(); + } + RETURN(e->refcnt); +} + + +/** + * lc_set: Sets an element in the cache. You might use this function to + * setup the cache. It is expected that the elements are properly initialized. + * @lc: The lru_cache object + * @enr: element number + * @index: The elements' position in the cache + */ +void lc_set(struct lru_cache* lc, unsigned int enr, int index) +{ + struct lc_element *e; + + if ( index < 0 || index >= lc->nr_elements ) return; + + e = lc_entry(lc,index); + e->lc_number = enr; + + hlist_del_init(&e->colision); + hlist_add_head( &e->colision, lc->slot + lc_hash_fn(lc,enr) ); + list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru); +} + --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./drivers/block/drbd/lru_cache.h 2005-08-24 18:45:04.000000000 +0400 @@ -0,0 +1,144 @@ +/* +-*- linux-c -*- + lru_cache.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 2003-2004, Philipp Reisner . + main author. 
+ + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +/* + The lru_cache describes a big set of objects that are addressed + by an index number (=lc_number). Only a small fraction of this set + is present in the cache. + (You set the size of the cache using lc_resize) + Once created, the api consists of + lc_find(,nr) -- finds the object with the given number, if present + lc_get(,nr) -- finds the object and increases the usage count + if not present, actions are taken to make sure that + the cache is updated, the user is notified of this by a callback. + Return value is NULL in this case. + As soon as the user informs the cache that it has been updated, + the next lc_get on that very object number will be successfull. + lc_put(,lc_element*) + -- decreases the usage count of this object, and returns the new value. + + NOTE: It is the USERS responsibility to make sure that calls do not happen concurrently. + */ + +#ifndef LRU_CACHE_H +#define LRU_CACHE_H + +#include +#ifndef HLIST_HEAD_INIT +# include "hlist.h" +#endif + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION (2,4,20) +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} +#endif + +#ifndef max +// For RH 2.4.9 +# define max(x,y) \ + ({ typeof(x) __x = (x); typeof(y) __y = (y); \ + (void)(&__x == &__y); \ + __x > __y ? __x: __y; }) +#endif + +#ifndef BUG_ON + /* for ancient 2.4 kernels */ +# define BUG_ON(condition) do { if (unlikely((condition)!=0)) BUG(); } while(0) +#endif + +struct lc_element { + struct hlist_node colision; + struct list_head list; // LRU list or free list + unsigned int refcnt; + unsigned int lc_number; +}; + +struct lru_cache { + struct list_head lru; + struct list_head free; + struct list_head in_use; + size_t element_size; + unsigned int nr_elements; + unsigned int new_number; + unsigned long flags; + struct lc_element *changing_element; // just for paranoia + + void *lc_private; + + struct hlist_head slot[0]; + // hash colision chains here, then element storage. 
+}; + + +// flag-bits for lru_cache +enum { + __LC_PARANOIA, + __LC_DIRTY, + __LC_STARVING, +}; +#define LC_PARANOIA (1<<__LC_PARANOIA) +#define LC_DIRTY (1<<__LC_DIRTY) +#define LC_STARVING (1<<__LC_STARVING) + +extern struct lru_cache* lc_alloc(unsigned int e_count, size_t e_size, + void *private_p); +extern void lc_free(struct lru_cache* lc); +extern void lc_set (struct lru_cache* lc, unsigned int enr, int index); +extern void lc_del (struct lru_cache* lc, struct lc_element *element); + +extern struct lc_element* lc_find(struct lru_cache* lc, unsigned int enr); +extern struct lc_element* lc_get (struct lru_cache* lc, unsigned int enr); +extern unsigned int lc_put (struct lru_cache* lc, struct lc_element* e); +extern void lc_changed(struct lru_cache* lc, struct lc_element* e); + + +/* This can be used to stop lc_get from changing the set of active elements. + * Note that the reference counts and order on the lru list may still change. + * returns true if we aquired the lock. + */ +static inline int lc_try_lock(struct lru_cache* lc) +{ + return !test_and_set_bit(__LC_DIRTY,&lc->flags); +} + +static inline void lc_unlock(struct lru_cache* lc) +{ + clear_bit(__LC_DIRTY,&lc->flags); + smp_mb__after_clear_bit(); +} + +#define LC_FREE (-1) + +#define lc_e_base(lc) ((char*) ( (lc)->slot + (lc)->nr_elements ) ) +#define lc_entry(lc,i) ((struct lc_element*) \ + (lc_e_base(lc) + (i)*(lc)->element_size)) +#define lc_index_of(lc,e) (((char*)(e) - lc_e_base(lc))/(lc)->element_size) + +#endif --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./drivers/block/drbd/mempool-2.4.c 2004-09-21 11:28:39.000000000 +0400 @@ -0,0 +1,335 @@ +/* + * linux/mm/mempool.c + * + * memory buffer pool support. Such pools are mostly used + * for guaranteed, deadlock-free memory allocations during + * extreme VM load. + * + * started by Ingo Molnar, Copyright (C) 2001 + * modified for inclusion with DRBD in 2003 by Philipp Reisner. + */ + +#include +#include +#include +#include +#include "mempool.h" + +#ifndef BUG_ON +# define BUG_ON(condition) do { if (unlikely((condition)!=0)) BUG(); } while(0) +#endif + +/** + * mempool_create - create a memory pool + * @min_nr: the minimum number of elements guaranteed to be + * allocated for this pool. + * @alloc_fn: user-defined element-allocation function. + * @free_fn: user-defined element-freeing function. + * @pool_data: optional private data available to the user-defined functions. + * + * this function creates and allocates a guaranteed size, preallocated + * memory pool. The pool can be used from the mempool_alloc and mempool_free + * functions. This function might sleep. Both the alloc_fn() and the free_fn() + * functions might sleep - as long as the mempool_alloc function is not called + * from IRQ contexts. + */ +mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data) +{ + mempool_t *pool; + int i; + + BUG_ON(!alloc_fn); + BUG_ON(!free_fn); + + pool = kmalloc(sizeof(*pool), GFP_KERNEL); + if (!pool) + return NULL; + memset(pool, 0, sizeof(*pool)); + + spin_lock_init(&pool->lock); + pool->min_nr = min_nr; + pool->pool_data = pool_data; + INIT_LIST_HEAD(&pool->elements); + init_waitqueue_head(&pool->wait); + pool->alloc = alloc_fn; + pool->free = free_fn; + + /* + * First pre-allocate the guaranteed number of buffers + * and nodes for them. 
+ */ + for (i = 0; i < min_nr; i++) { + void *element; + mempool_node_t *node; + + node = kmalloc(sizeof(*node), GFP_KERNEL); + element = NULL; + if (node) + element = pool->alloc(GFP_KERNEL, pool->pool_data); + + if (unlikely(!element)) { + /* + * Not enough memory - free the allocated ones + * and return. `node' may be NULL here. + */ + kfree(node); + while (!list_empty(&pool->elements)) { + node = list_entry(pool->elements.next, + mempool_node_t, list); + list_del(&node->list); + pool->free(node->element, pool->pool_data); + kfree(node); + } + kfree(pool); + return NULL; + } + node->element = element; + list_add(&node->list, &pool->elements); + pool->curr_nr++; + } + return pool; +} + +/** + * mempool_resize - resize an existing memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @new_min_nr: the new minimum number of elements guaranteed to be + * allocated for this pool. + * @gfp_mask: the usual allocation bitmask. + * + * This function shrinks/grows the pool. In the case of growing, + * it cannot be guaranteed that the pool will be grown to the new + * size immediately, but new mempool_free() calls will refill it. + * + * Note, the caller must guarantee that no mempool_destroy is called + * while this function is running. mempool_alloc() & mempool_free() + * might be called (eg. from IRQ contexts) while this function executes. + */ +void mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask) +{ + int delta; + unsigned long flags; + + if (new_min_nr <= 0) + BUG(); + + spin_lock_irqsave(&pool->lock, flags); + if (new_min_nr < pool->min_nr) { + pool->min_nr = new_min_nr; + /* + * Free possible excess elements. + */ + while (pool->curr_nr > pool->min_nr) { + mempool_node_t *node; + + if (list_empty(&pool->elements)) + BUG(); + node = list_entry(pool->elements.next, + mempool_node_t, list); + if (node->element == NULL) + BUG(); + list_del(&node->list); + pool->curr_nr--; + spin_unlock_irqrestore(&pool->lock, flags); + pool->free(node->element, pool->pool_data); + kfree(node); + spin_lock_irqsave(&pool->lock, flags); + } + spin_unlock_irqrestore(&pool->lock, flags); + return; + } + delta = new_min_nr - pool->min_nr; + pool->min_nr = new_min_nr; + spin_unlock_irqrestore(&pool->lock, flags); + + /* + * We refill the pool up to the new treshold - but we dont + * (cannot) guarantee that the refill succeeds. + */ + while (delta) { + mempool_node_t *node; + + node = kmalloc(sizeof(*node), gfp_mask); + if (!node) + break; + node->element = pool->alloc(gfp_mask, pool->pool_data); + if (!node->element) { + kfree(node); + break; + } + spin_lock_irqsave(&pool->lock, flags); + list_add(&node->list, &pool->elements); + pool->curr_nr++; + spin_unlock_irqrestore(&pool->lock, flags); + delta--; + } + wake_up(&pool->wait); +} + +/** + * mempool_destroy - deallocate a memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * this function only sleeps if the free_fn() function sleeps. The caller + * has to guarantee that no mempool_alloc() nor mempool_free() happens in + * this pool when calling this function. + * + * This function will go BUG() if there are outstanding elements in the + * pool. The mempool client must put them all back before destroying the + * mempool. 
+ */ +void mempool_destroy(mempool_t *pool) +{ + if (!pool) + return; + + if (pool->curr_nr != pool->min_nr) + printk(KERN_ERR "drbd: in %s(%p): curr_nr(%d) != min_nr(%d)\n", + __func__,pool,pool->curr_nr,pool->min_nr); + while (!list_empty(&pool->elements)) { + mempool_node_t *node; + + node = list_entry(pool->elements.prev, + mempool_node_t, list); + list_del(&node->list); + if (node->element) { + pool->curr_nr--; + pool->free(node->element, pool->pool_data); + } + kfree(node); + } + if (pool->curr_nr) + BUG(); + kfree(pool); +} + +/** + * mempool_alloc - allocate an element from a specific memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * @gfp_mask: the usual allocation bitmask. + * + * this function only sleeps if the alloc_fn function sleeps or + * returns NULL. Note that due to preallocation, this function + * *never* fails when called from process contexts. (it might + * fail if called from an IRQ context.) + */ +void * mempool_alloc(mempool_t *pool, int gfp_mask) +{ + void *element; + unsigned long flags; + int curr_nr; + DECLARE_WAITQUEUE(wait, current); + int gfp_nowait = gfp_mask & ~(__GFP_WAIT | __GFP_IO); + +repeat_alloc: + element = pool->alloc(gfp_nowait, pool->pool_data); + if (likely(element != NULL)) + return element; + + /* + * If the pool is less than 50% full then try harder + * to allocate an element: + */ + if ((gfp_mask != gfp_nowait) && (pool->curr_nr <= pool->min_nr/2)) { + element = pool->alloc(gfp_mask, pool->pool_data); + if (likely(element != NULL)) + return element; + } + + /* + * Kick the VM at this point. + */ + // wakeup_bdflush(); -- Modules can not do this; PRE + + spin_lock_irqsave(&pool->lock, flags); + if (likely(pool->curr_nr)) { + mempool_node_t *node; + + node = list_entry(pool->elements.next, + mempool_node_t, list); + list_del(&node->list); + element = node->element; + if (element == NULL) + BUG(); + node->element = NULL; + list_add_tail(&node->list, &pool->elements); + pool->curr_nr--; + spin_unlock_irqrestore(&pool->lock, flags); + return element; + } + spin_unlock_irqrestore(&pool->lock, flags); + + /* We must not sleep in the GFP_ATOMIC case */ + if (gfp_mask == gfp_nowait) + return NULL; + + run_task_queue(&tq_disk); + + add_wait_queue_exclusive(&pool->wait, &wait); + set_task_state(current, TASK_UNINTERRUPTIBLE); + + spin_lock_irqsave(&pool->lock, flags); + curr_nr = pool->curr_nr; + spin_unlock_irqrestore(&pool->lock, flags); + + if (!curr_nr) + schedule(); + + current->state = TASK_RUNNING; + remove_wait_queue(&pool->wait, &wait); + + goto repeat_alloc; +} + +/** + * mempool_free - return an element to the pool. + * @element: pool element pointer. + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * this function only sleeps if the free_fn() function sleeps. + */ +void mempool_free(void *element, mempool_t *pool) +{ + unsigned long flags; + + if (pool->curr_nr < pool->min_nr) { + spin_lock_irqsave(&pool->lock, flags); + if (pool->curr_nr < pool->min_nr) { + mempool_node_t *node; + + node = list_entry(pool->elements.prev, + mempool_node_t, list); + list_del(&node->list); + if (node->element) + BUG(); + node->element = element; + list_add(&node->list, &pool->elements); + pool->curr_nr++; + spin_unlock_irqrestore(&pool->lock, flags); + wake_up(&pool->wait); + return; + } + spin_unlock_irqrestore(&pool->lock, flags); + } + pool->free(element, pool->pool_data); +} + +/* + * A commonly used alloc and free fn. 
+ */ +void *mempool_alloc_slab(int gfp_mask, void *pool_data) +{ + kmem_cache_t *mem = (kmem_cache_t *) pool_data; + return kmem_cache_alloc(mem, gfp_mask); +} + +void mempool_free_slab(void *element, void *pool_data) +{ + kmem_cache_t *mem = (kmem_cache_t *) pool_data; + kmem_cache_free(mem, element); +} --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./drivers/block/drbd/mempool.h 2005-08-24 18:45:04.000000000 +0400 @@ -0,0 +1,49 @@ +/* + * memory buffer pool support + */ +#ifndef _LINUX_MEMPOOL_H +#define _LINUX_MEMPOOL_H + +#include +#include + +typedef void * (mempool_alloc_t)(int gfp_mask, void *pool_data); +typedef void (mempool_free_t)(void *element, void *pool_data); + +/* + * A structure for linking multiple client objects into + * a mempool_t + */ +typedef struct mempool_node_s { + struct list_head list; + void *element; +} mempool_node_t; + +/* + * The elements list has full mempool_node_t's at ->next, and empty ones + * at ->prev. Emptiness is signified by mempool_node_t.element == NULL. + * + * curr_nr refers to how many full mempool_node_t's are at ->elements. + * We don't track the total number of mempool_node_t's at ->elements; + * it is always equal to min_nr. + */ +typedef struct mempool_s { + spinlock_t lock; + int min_nr, curr_nr; + struct list_head elements; + + void *pool_data; + mempool_alloc_t *alloc; + mempool_free_t *free; + wait_queue_head_t wait; +} mempool_t; +extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data); +extern void mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask); +extern void mempool_destroy(mempool_t *pool); +extern void * mempool_alloc(mempool_t *pool, int gfp_mask); +extern void mempool_free(void *element, mempool_t *pool); +extern void *mempool_alloc_slab(int gfp_mask, void *pool_data); +extern void mempool_free_slab(void *element, void *pool_data); + +#endif /* _LINUX_MEMPOOL_H */ --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./include/linux/drbd.h 2004-11-02 11:57:34.000000000 +0300 @@ -0,0 +1,246 @@ +/* + drbd.h + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ +#ifndef DRBD_H +#define DRBD_H +#include + +#include + +#ifdef __KERNEL__ +#include +#include +#else +#include +#include +#include +#include +#endif + +#ifdef __KERNEL__ +#define IN const +#define OUT +#define INOUT +#else +#define IN +#define OUT const +#define INOUT +#endif + +/* + - Never forget to place bigger members before the smaller ones, + to avoid unaligned placement of members on 64 bit architectures. + - Never forget to add explicit _pad members to make sizeof(struct) + divisible by 8. 
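+
+   A purely illustrative example (not one of the structs below): three int
+   members alone would give sizeof() == 12, so a fourth one is added,
+
+	struct example_conf {
+		int a;
+		int b;
+		int c;
+		const int _pad;	// sizeof() is now 16, a multiple of 8
+	};
+
+   so members that follow such a struct inside an enclosing structure get
+   the same offsets on 32 bit and 64 bit user space.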
+*/
+
+#define MAX_SOCK_ADDR	128	/* 108 for Unix domain -
+				   16 for IP, 16 for IPX,
+				   24 for IPv6,
+				   about 80 for AX.25
+				   must be at least one bigger than
+				   the AF_UNIX size (see net/unix/af_unix.c
+				   :unix_mkname()).
+				 */
+
+enum io_error_handler {
+	PassOn,
+	Panic,
+	Detach
+};
+
+
+struct disk_config {
+	IN __u64 disk_size;
+	IN int lower_device;
+	IN enum io_error_handler on_io_error;
+	IN int meta_device;
+	IN int meta_index;
+};
+
+enum disconnect_handler {
+	Reconnect,
+	DropNetConf,
+	FreezeIO
+};
+
+struct net_config {
+	IN char my_addr[MAX_SOCK_ADDR];
+	IN char other_addr[MAX_SOCK_ADDR];
+	IN int my_addr_len;
+	IN int other_addr_len;
+	IN int timeout;			// deciseconds
+	IN int wire_protocol;
+	IN int try_connect_int;		/* seconds */
+	IN int ping_int;		/* seconds */
+	IN int max_epoch_size;
+	IN int max_buffers;
+	IN int sndbuf_size;		/* socket send buffer size */
+	IN unsigned int ko_count;
+	IN enum disconnect_handler on_disconnect;
+	const int _pad;
+};
+
+struct syncer_config {
+	int rate;		/* KB/sec */
+	int use_csums;		/* use checksum based syncing */
+	int skip;
+	int group;
+	int al_extents;
+	const int _pad;
+};
+
+/* KEEP the order, do not delete or insert!
+ * Or change the API_VERSION, too. */
+enum ret_codes {
+	NoError=0,
+	LAAlreadyInUse,
+	OAAlreadyInUse,
+	LDFDInvalid,
+	MDFDInvalid,
+	LDAlreadyInUse,
+	LDNoBlockDev,
+	MDNoBlockDev,
+	LDOpenFailed,
+	MDOpenFailed,
+	LDDeviceTooSmall,
+	MDDeviceTooSmall,
+	LDNoConfig,
+	LDMounted,
+	MDMounted,
+	LDMDInvalid,
+	LDDeviceTooLarge,
+};
+
+struct ioctl_disk_config {
+	struct disk_config config;
+	OUT enum ret_codes ret_code;
+	const int _pad;
+};
+
+struct ioctl_net_config {
+	struct net_config config;
+	OUT enum ret_codes ret_code;
+	const int _pad;
+};
+
+struct ioctl_syncer_config {
+	struct syncer_config config;
+	OUT enum ret_codes ret_code;
+	const int _pad;
+};
+
+struct ioctl_wait {
+	IN int wfc_timeout;
+	IN int degr_wfc_timeout;
+	OUT int ret_code;
+	int _pad;
+};
+
+#define DRBD_PROT_A 1
+#define DRBD_PROT_B 2
+#define DRBD_PROT_C 3
+
+typedef enum {
+	Unknown=0,
+	Primary=1,		// role
+	Secondary=2,		// role
+	Human=4,		// flag for set_state
+	TimeoutExpired=8,	// flag for set_state
+	DontBlameDrbd=16	// flag for set_state
+} Drbd_State;
+
+/* The order of these constants is important.
+ * The lower ones (<WFReportParams) indicate
+ * that there is no socket!
+ * >=WFReportParams ==> There is a socket
+ *
+ * THINK
+ * Skipped should be < Connected,
+ * so writes on a Primary after Skipped sync are not mirrored either ?
+ */
+typedef enum {
+	Unconfigured,
+	StandAlone,
+	Unconnected,
+	Timeout,
+	BrokenPipe,
+	NetworkFailure,
+	WFConnection,
+	WFReportParams,	// we have a socket
+	Connected,	// we have introduced each other
+	SkippedSyncS,	// we should have synced, but user said no
+	SkippedSyncT,
+	WFBitMapS,
+	WFBitMapT,
+	SyncSource,	// The distance between original state and pause
+	SyncTarget,	// state must be the same for source and target.
(+2) + PausedSyncS, // see _drbd_rs_resume() and _drbd_rs_pause() + PausedSyncT, // is sync target, but higher priority groups first +} Drbd_CState; + +#ifndef BDEVNAME_SIZE +# define BDEVNAME_SIZE 32 +#endif + +struct ioctl_get_config { + OUT __u64 disk_size_user; + OUT char lower_device_name[BDEVNAME_SIZE]; + OUT char meta_device_name[BDEVNAME_SIZE]; + struct net_config nconf; + struct syncer_config sconf; + OUT int lower_device_major; + OUT int lower_device_minor; + OUT enum io_error_handler on_io_error; + OUT int meta_device_major; + OUT int meta_device_minor; + OUT int meta_index; + OUT Drbd_CState cstate; + OUT Drbd_State state; + OUT Drbd_State peer_state; + int _pad; +}; + +#define DRBD_MAGIC 0x83740267 +#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC) + +/* 'D' already taken by s390 dasd driver. + * maybe we want to change to something else, and register it officially? + */ +#define DRBD_IOCTL_LETTER 'D' +#define DRBD_IOCTL_GET_VERSION _IOR( DRBD_IOCTL_LETTER, 0x00, int ) +#define DRBD_IOCTL_SET_STATE _IOW( DRBD_IOCTL_LETTER, 0x02, Drbd_State ) +#define DRBD_IOCTL_SET_DISK_CONFIG _IOW( DRBD_IOCTL_LETTER, 0x06, struct ioctl_disk_config ) +#define DRBD_IOCTL_SET_NET_CONFIG _IOW( DRBD_IOCTL_LETTER, 0x07, struct ioctl_net_config ) +#define DRBD_IOCTL_UNCONFIG_NET _IO ( DRBD_IOCTL_LETTER, 0x08 ) +#define DRBD_IOCTL_GET_CONFIG _IOW( DRBD_IOCTL_LETTER, 0x0A, struct ioctl_get_config ) +#define DRBD_IOCTL_INVALIDATE _IO ( DRBD_IOCTL_LETTER, 0x0D ) +#define DRBD_IOCTL_INVALIDATE_REM _IO ( DRBD_IOCTL_LETTER, 0x0E ) +#define DRBD_IOCTL_SET_SYNC_CONFIG _IOW( DRBD_IOCTL_LETTER, 0x0F, struct ioctl_syncer_config ) +#define DRBD_IOCTL_SET_DISK_SIZE _IOW( DRBD_IOCTL_LETTER, 0x10, unsigned int ) +#define DRBD_IOCTL_WAIT_CONNECT _IOR( DRBD_IOCTL_LETTER, 0x11, struct ioctl_wait ) +#define DRBD_IOCTL_WAIT_SYNC _IOR( DRBD_IOCTL_LETTER, 0x12, struct ioctl_wait ) +#define DRBD_IOCTL_UNCONFIG_DISK _IO ( DRBD_IOCTL_LETTER, 0x13 ) +#define DRBD_IOCTL_SET_STATE_FLAGS _IOW( DRBD_IOCTL_LETTER, 0x14, Drbd_State ) + + +#endif + --- /dev/null 2003-04-26 02:10:32.000000000 +0400 +++ ./include/linux/drbd_config.h 2006-02-13 17:39:11.000000000 +0300 @@ -0,0 +1,68 @@ +/* + drbd_config.h + DRBD's compile time configuration. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef DRBD_CONFIG_H +#define DRBD_CONFIG_H + +extern const char * drbd_buildtag(void); + +#define REL_VERSION "0.7.16" +#define API_VERSION 77 +#define PRO_VERSION 74 + +//#define DBG_ALL_SYMBOLS // no static functs, improves quality of OOPS traces + +//#define DBG_SPINLOCKS // enables MUST_HOLD macro (assertions for spinlocks) +//#define DBG_ASSERTS // drbd_assert_breakpoint() function +//#define DUMP_MD 1 // Dump metadata to syslog upon connect +#define DUMP_MD 2 // Dump even all cstate changes (I like it!) +//#define DUMP_MD 3 // Dump even all meta data access + // (don't! unless we track down a bug...) 
+
+//#define SIGHAND_HACK           // Needed for RH 2.4.20 and later kernels.
+//#define REDHAT_HLIST_BACKPORT  // Makes DRBD work on RH9 kernels
+
+/* Some redhat 2.4.X-Y.Z.whatever kernel flavours have an mm_inline.h,
+ * which needs to be included explicitly. Most 2.4.x kernels don't have that
+ * header file at all. So uncomment for these, and ignore it for all others.
+ * In 2.6 it is included anyway.
+ */
+//#define HAVE_MM_INLINE_H
+
+// Your 2.4 vendor kernel already defines find_next_bit()
+//#define HAVE_FIND_NEXT_BIT
+
+// Your 2.4 kernel does not define find_next_bit(),
+// and you are too lazy to "backport" it from 2.6 for your arch:
+//#define USE_GENERIC_FIND_NEXT_BIT
+
+//#define PARANOIA // some extra checks
+
+// don't enable this unless you can cope with gigabyte syslogs :)
+//#define DUMP_EACH_PACKET
+
+// Dump every hour the usage / non-usage of zero copy IO
+//#define SHOW_SENDPAGE_USAGE
+
+// You can disable the use of the sendpage() call (= zero copy IO)
+// if you have the feeling that it might be the cause of trouble.
+// #define DRBD_DISABLE_SENDPAGE
+
+#endif
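
The ioctl interface declared in include/linux/drbd.h above is driven from
user space.  A minimal sketch of such a caller is shown below; the device
node /dev/drbd0, the includes and the (missing) error handling are
assumptions of the sketch, not part of the patch:

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/drbd.h>

	int main(void)
	{
		struct ioctl_get_config cfg;
		int version = 0;
		int fd = open("/dev/drbd0", O_RDONLY);	/* device node assumed */

		if (fd < 0)
			return 1;

		/* ask the module which API version it speaks */
		if (ioctl(fd, DRBD_IOCTL_GET_VERSION, &version) == 0)
			printf("api version: %d\n", version);

		/* fetch the current configuration, connection state and roles */
		if (ioctl(fd, DRBD_IOCTL_GET_CONFIG, &cfg) == 0)
			printf("cstate=%d state=%d peer=%d\n",
			       cfg.cstate, cfg.state, cfg.peer_state);

		close(fd);
		return 0;
	}

cfg.cstate, cfg.state and cfg.peer_state correspond to the Drbd_CState and
Drbd_State enums declared in drbd.h.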