--- ./drivers/block/Kconfig.drbd	2006-03-31 21:06:51.000000000 +0400
+++ ./drivers/block/Kconfig	2006-03-31 21:07:01.000000000 +0400
@@ -356,4 +356,6 @@ config ATA_OVER_ETH
 
 source "drivers/s390/block/Kconfig"
 
+source "drivers/block/drbd/Kconfig"
+
 endmenu
--- ./drivers/block/Makefile.drbd	2006-03-31 21:06:51.000000000 +0400
+++ ./drivers/block/Makefile	2006-03-31 21:07:01.000000000 +0400
@@ -36,6 +36,7 @@
 obj-$(CONFIG_BLK_CPQ_DA)	+= cpqarray.o
 obj-$(CONFIG_BLK_CPQ_CISS_DA)	+= cciss.o
 obj-$(CONFIG_BLK_DEV_DAC960)	+= DAC960.o
 obj-$(CONFIG_ATA_OVER_ETH)	+= aoe/
+obj-$(CONFIG_BLK_DEV_DRBD)	+= drbd/
 obj-$(CONFIG_BLK_DEV_UMEM)	+= umem.o
 obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
--- /dev/null	2006-06-16 16:00:55.900357250 +0400
+++ ./drivers/block/drbd/Kconfig	2006-06-21 16:57:01.000000000 +0400
@@ -0,0 +1,34 @@
+#
+# DRBD device driver configuration
+#
+config BLK_DEV_DRBD
+	tristate "DRBD Distributed replicated block device support"
+	select INET
+	select PROC_FS
+	---help---
+	  Drbd is a block device which is designed to build high availability
+	  clusters. This is done by mirroring a whole block device via (a
+	  dedicated) network. You could see it as a network RAID 1.
+
+	  Each device (drbd provides more than one of these devices) has a
+	  state, which can be 'primary' or 'secondary'. On the node with the
+	  primary device the application is supposed to run and to access the
+	  device (/dev/drbdX). Every write is sent to the local 'lower level
+	  block device' and via network to the node with the device in
+	  'secondary' state.
+	  The secondary device simply writes the data to its lower level block
+	  device. Reads are always carried out locally.
+
+	  Drbd management is done through user-space tools.
+
+	  Historically DRBD hijacked the NBD major number (43)
+	  and device nodes (/dev/nbX).
+	  We now have an officially assigned major number (147)
+	  and /dev/drbdX.
+
+	  If for whatever weird reason you want to keep the old behaviour,
+	  you can give a "use_nbd_major" module parameter.
+
+	  http://www.drbd.org/
+
+	  If unsure, say N.
--- /dev/null	2006-06-16 16:00:55.900357250 +0400
+++ ./drivers/block/drbd/Makefile	2006-06-21 16:57:01.000000000 +0400
@@ -0,0 +1,7 @@
+CFLAGS_drbd_sizeof_sanity_check.o = # -Wpadded # -Werror
+
+drbd-objs := drbd_sizeof_sanity_check.o \
+	drbd_buildtag.o drbd_bitmap.o drbd_fs.o drbd_proc.o \
+	drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o \
+	lru_cache.o drbd_main.o
+obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
--- /dev/null	2006-06-16 16:00:55.900357250 +0400
+++ ./drivers/block/drbd/drbd_actlog.c	2006-06-21 16:57:01.000000000 +0400
@@ -0,0 +1,991 @@
+/*
+-*- linux-c -*-
+   drbd_actlog.c
+   Kernel module for 2.4.x/2.6.x Kernels
+
+   This file is part of drbd by Philipp Reisner.
+
+   Copyright (C) 2003-2004, Philipp Reisner .
+   Copyright (C) 2003-2004, Lars Ellenberg .
+	authors.
+
+   drbd is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2, or (at your option)
+   any later version.
+
+   drbd is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with drbd; see the file COPYING. If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ + */ + +#include +#include +#include "drbd_int.h" + +/* This is what I like so much about the linux kernel: + * if you have a close look, you can almost always reuse code by someone else + * ;) + * this is mostly from drivers/md/md.c + */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +STATIC int _drbd_md_sync_page_io(drbd_dev *mdev, struct page *page, + sector_t sector, int rw, int size) +{ + struct buffer_head bh; + struct completion event; + int ok; + + init_completion(&event); + init_buffer(&bh, drbd_md_io_complete, &event); + bh.b_rdev = mdev->md_bdev; + bh.b_rsector = sector; + bh.b_state = (1 << BH_Req) | (1 << BH_Mapped) | (1 << BH_Lock); + bh.b_size = size; + bh.b_page = page; + bh.b_reqnext = NULL; + bh.b_data = page_address(page); + generic_make_request(rw, &bh); + + run_task_queue(&tq_disk); + wait_for_completion(&event); + + ok = test_bit(BH_Uptodate, &bh.b_state); + + return ok; +} +#else +STATIC int _drbd_md_sync_page_io(drbd_dev *mdev, struct page *page, + sector_t sector, int rw, int size) +{ + struct bio *bio = bio_alloc(GFP_NOIO, 1); + struct completion event; + int ok; + + bio->bi_bdev = mdev->md_bdev; + bio->bi_sector = sector; + bio_add_page(bio, page, size, 0); + init_completion(&event); + bio->bi_private = &event; + bio->bi_end_io = drbd_md_io_complete; + +#ifdef BIO_RW_SYNC + submit_bio(rw | (1 << BIO_RW_SYNC), bio); +#else + submit_bio(rw, bio); + drbd_blk_run_queue(bdev_get_queue(mdev->md_bdev)); +#endif + wait_for_completion(&event); + + ok = test_bit(BIO_UPTODATE, &bio->bi_flags); + bio_put(bio); + return ok; +} +#endif + +int drbd_md_sync_page_io(drbd_dev *mdev, sector_t sector, int rw) +{ + int hardsect,mask,ok,offset=0; + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); + struct page *iop = mdev->md_io_page; + + D_ASSERT(semaphore_is_locked(&mdev->md_io_mutex)); + + if (!mdev->md_bdev) { + if (test_bit(DISKLESS,&mdev->flags)) return 0; + if (DRBD_ratelimit(5*HZ,5)) { + ERR("mdev->md_bdev==NULL\n"); + dump_stack(); + } + return 0; + } + + + hardsect = drbd_get_hardsect(mdev->md_bdev); + + // in case hardsect != 512 [ s390 only? ] + if( hardsect != MD_HARDSECT ) { + if(!mdev->md_io_tmpp) { + struct page *page = alloc_page(GFP_NOIO); + if(!page) return 0; + + WARN("Meta data's bdev hardsect_size != %d\n", + MD_HARDSECT); + WARN("Workaround engaged (has performace impact).\n"); + + mdev->md_io_tmpp = page; + } + + mask = ( hardsect / MD_HARDSECT ) - 1; + D_ASSERT( mask == 1 || mask == 3 || mask == 7 ); + D_ASSERT( hardsect == (mask+1) * MD_HARDSECT ); + offset = sector & mask; + sector = sector & ~mask; + iop = mdev->md_io_tmpp; + + if (rw == WRITE) { + void *p = page_address(mdev->md_io_page); + void *hp = page_address(mdev->md_io_tmpp); + + ok = _drbd_md_sync_page_io(mdev,iop, + sector,READ,hardsect); + + if (unlikely(!ok)) return 0; + + memcpy(hp + offset*MD_HARDSECT , p, MD_HARDSECT); + } + } + +#if DUMP_MD >= 3 + INFO("%s [%d]:%s(,%llu,%s)\n", + current->comm, current->pid, __func__, + (unsigned long long)sector, rw ? "WRITE" : "READ"); +#endif + + if (sector < drbd_md_ss(mdev) || + sector > drbd_md_ss(mdev)+MD_BM_OFFSET+BM_SECT_TO_EXT(capacity)) { + ALERT("%s [%d]:%s(,%llu,%s) out of range md access!\n", + current->comm, current->pid, __func__, + (unsigned long long)sector, rw ? "WRITE" : "READ"); + } + + ok = _drbd_md_sync_page_io(mdev,iop,sector,rw,hardsect); + if (unlikely(!ok)) { + ERR("drbd_md_sync_page_io(,%llu,%s) failed!\n", + (unsigned long long)sector,rw ? 
"WRITE" : "READ"); + } + + if( hardsect != MD_HARDSECT && rw == READ ) { + void *p = page_address(mdev->md_io_page); + void *hp = page_address(mdev->md_io_tmpp); + + memcpy(p, hp + offset*MD_HARDSECT, MD_HARDSECT); + } + + return ok; +} + + +struct __attribute__((packed)) al_transaction { + u32 magic; + u32 tr_number; + // u32 tr_generation; //TODO + struct __attribute__((packed)) { + u32 pos; + u32 extent; } updates[1 + AL_EXTENTS_PT]; + u32 xor_sum; + // I do not believe that all storage medias can guarantee atomic + // 512 byte write operations. When the journal is read, only + // transactions with correct xor_sums are considered. +}; // sizeof() = 512 byte + + +struct update_odbm_work { + struct drbd_work w; + unsigned int enr; +}; + +struct update_al_work { + struct drbd_work w; + struct lc_element * al_ext; + struct completion event; + unsigned int enr; +}; + +STATIC int w_al_write_transaction(struct Drbd_Conf *, struct drbd_work *, int); + +static inline +struct lc_element* _al_get(struct Drbd_Conf *mdev, unsigned int enr) +{ + struct lc_element *al_ext; + struct bm_extent *bm_ext; + unsigned long al_flags=0; + + spin_lock_irq(&mdev->al_lock); + bm_ext = (struct bm_extent*) lc_find(mdev->resync,enr/AL_EXT_PER_BM_SECT); + if (unlikely(bm_ext!=NULL)) { + if(test_bit(BME_NO_WRITES,&bm_ext->flags)) { + spin_unlock_irq(&mdev->al_lock); + //INFO("Delaying app write until sync read is done\n"); + return 0; + } + } + al_ext = lc_get(mdev->act_log,enr); + al_flags = mdev->act_log->flags; + spin_unlock_irq(&mdev->al_lock); + + /* + if (!al_ext) { + if (al_flags & LC_STARVING) + WARN("Have to wait for LRU element (AL too small?)\n"); + if (al_flags & LC_DIRTY) + WARN("Ongoing AL update (AL device too slow?)\n"); + } + */ + + return al_ext; +} + +void drbd_al_begin_io(struct Drbd_Conf *mdev, sector_t sector) +{ + unsigned int enr = (sector >> (AL_EXTENT_SIZE_B-9)); + struct lc_element *al_ext; + struct update_al_work al_work; + + D_ASSERT(atomic_read(&mdev->local_cnt)>0); + wait_event(mdev->al_wait, (al_ext = _al_get(mdev,enr)) ); + + if (al_ext->lc_number != enr) { + // We have to do write an transaction to AL. + unsigned int evicted; + + evicted = al_ext->lc_number; + + if(mdev->cstate < Connected && evicted != LC_FREE ) { + drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT ); + } + + /* drbd_al_write_transaction(mdev,al_ext,enr); + generic_make_request() are serialized on the + current->bio_tail list now. Therefore we have + to deligate writing something to AL to the + worker thread. 
*/ + init_completion(&al_work.event); + al_work.al_ext = al_ext; + al_work.enr = enr; + al_work.w.cb = w_al_write_transaction; + drbd_queue_work_front(mdev,&mdev->data.work,&al_work.w); + wait_for_completion(&al_work.event); + + mdev->al_writ_cnt++; + + /* + DUMPI(al_ext->lc_number); + DUMPI(mdev->act_log->new_number); + */ + spin_lock_irq(&mdev->al_lock); + lc_changed(mdev->act_log,al_ext); + spin_unlock_irq(&mdev->al_lock); + wake_up(&mdev->al_wait); + } +} + +void drbd_al_complete_io(struct Drbd_Conf *mdev, sector_t sector) +{ + unsigned int enr = (sector >> (AL_EXTENT_SIZE_B-9)); + struct lc_element *extent; + unsigned long flags; + + spin_lock_irqsave(&mdev->al_lock,flags); + + extent = lc_find(mdev->act_log,enr); + + if(!extent) { + spin_unlock_irqrestore(&mdev->al_lock,flags); + ERR("al_complete_io() called on inactive extent %u\n",enr); + return; + } + + if( lc_put(mdev->act_log,extent) == 0 ) { + wake_up(&mdev->al_wait); + } + + spin_unlock_irqrestore(&mdev->al_lock,flags); +} + +STATIC int +w_al_write_transaction(struct Drbd_Conf *mdev, struct drbd_work *w, int unused) +{ + int i,n,mx; + unsigned int extent_nr; + struct al_transaction* buffer; + sector_t sector; + u32 xor_sum=0; + + struct lc_element *updated = ((struct update_al_work*)w)->al_ext; + unsigned int new_enr = ((struct update_al_work*)w)->enr; + + down(&mdev->md_io_mutex); // protects md_io_buffer, al_tr_cycle, ... + buffer = (struct al_transaction*)page_address(mdev->md_io_page); + + buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); + buffer->tr_number = cpu_to_be32(mdev->al_tr_number); + + n = lc_index_of(mdev->act_log, updated); + + buffer->updates[0].pos = cpu_to_be32(n); + buffer->updates[0].extent = cpu_to_be32(new_enr); + +#if 0 /* Use this printf with the test_al.pl program */ + ERR("T%03d S%03d=E%06d\n", mdev->al_tr_number,n,new_enr); +#endif + + xor_sum ^= new_enr; + + mx = min_t(int,AL_EXTENTS_PT, + mdev->act_log->nr_elements - mdev->al_tr_cycle); + for(i=0;iact_log, + mdev->al_tr_cycle+i)->lc_number; + buffer->updates[i+1].pos = cpu_to_be32(mdev->al_tr_cycle+i); + buffer->updates[i+1].extent = cpu_to_be32(extent_nr); + xor_sum ^= extent_nr; + } + for(;iupdates[i+1].pos = __constant_cpu_to_be32(-1); + buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE); + xor_sum ^= LC_FREE; + } + mdev->al_tr_cycle += AL_EXTENTS_PT; + if(mdev->al_tr_cycle >= mdev->act_log->nr_elements) mdev->al_tr_cycle=0; + + buffer->xor_sum = cpu_to_be32(xor_sum); + + + sector = drbd_md_ss(mdev) + MD_AL_OFFSET + mdev->al_tr_pos ; + + if(!drbd_md_sync_page_io(mdev,sector,WRITE)) { + drbd_chk_io_error(mdev, 1); + drbd_io_error(mdev); + } + + if( ++mdev->al_tr_pos > div_ceil(mdev->act_log->nr_elements,AL_EXTENTS_PT) ) { + mdev->al_tr_pos=0; + } + D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); + mdev->al_tr_number++; + + up(&mdev->md_io_mutex); + + complete(&((struct update_al_work*)w)->event); + + return 1; +} + +/** + * drbd_al_read_tr: Reads a single transaction record form the + * on disk activity log. + * Returns -1 on IO error, 0 on checksum error and 1 if it is a valid + * record. 
+ */ +STATIC int drbd_al_read_tr(struct Drbd_Conf *mdev, + struct al_transaction* b, + int index) +{ + sector_t sector; + int rv,i; + u32 xor_sum=0; + + sector = drbd_md_ss(mdev) + MD_AL_OFFSET + index; + + if(!drbd_md_sync_page_io(mdev,sector,READ)) { + drbd_chk_io_error(mdev, 1); + drbd_io_error(mdev); + return -1; + } + + rv = ( be32_to_cpu(b->magic) == DRBD_MAGIC ); + + for(i=0;iupdates[i].extent); + } + rv &= (xor_sum == be32_to_cpu(b->xor_sum)); + + return rv; +} + +/** + * drbd_al_read_log: Restores the activity log from its on disk + * representation. Returns 1 on success, returns 0 when + * reading the log failed due to IO errors. + */ +int drbd_al_read_log(struct Drbd_Conf *mdev) +{ + struct al_transaction* buffer; + int from=-1,to=-1,i,cnr, overflow=0,rv; + u32 from_tnr=-1, to_tnr=0; + int active_extents=0; + int transactions=0; + int mx; + + mx = div_ceil(mdev->act_log->nr_elements,AL_EXTENTS_PT); + + /* lock out all other meta data io for now, + * and make sure the page is mapped. + */ + down(&mdev->md_io_mutex); + buffer = page_address(mdev->md_io_page); + + // Find the valid transaction in the log + for(i=0;i<=mx;i++) { + rv = drbd_al_read_tr(mdev,buffer,i); + if(rv == 0) continue; + if(rv == -1) { + up(&mdev->md_io_mutex); + return 0; + } + cnr = be32_to_cpu(buffer->tr_number); + // INFO("index %d valid tnr=%d\n",i,cnr); + + if(cnr == -1) overflow=1; + + if(cnr < from_tnr && !overflow) { + from = i; + from_tnr = cnr; + } + if(cnr > to_tnr) { + to = i; + to_tnr = cnr; + } + } + + if(from == -1 || to == -1) { + WARN("No usable activity log found.\n"); + + up(&mdev->md_io_mutex); + return 1; + } + + // Read the valid transactions. + // INFO("Reading from %d to %d.\n",from,to); + + /* this should better be handled by a for loop, no? + */ + i=from; + while(1) { + int j,pos; + unsigned int extent_nr; + unsigned int trn; + + rv = drbd_al_read_tr(mdev,buffer,i); + ERR_IF(rv == 0) goto cancel; + if(rv == -1) { + up(&mdev->md_io_mutex); + return 0; + } + + trn=be32_to_cpu(buffer->tr_number); + + spin_lock_irq(&mdev->al_lock); + + /* This loop runs backwards because in the cyclic + elements there might be an old version of the + updated element (in slot 0). So the element in slot 0 + can overwrite old versions. */ + for(j=AL_EXTENTS_PT;j>=0;j--) { + pos = be32_to_cpu(buffer->updates[j].pos); + extent_nr = be32_to_cpu(buffer->updates[j].extent); + + if(extent_nr == LC_FREE) continue; + + //if(j<3) INFO("T%03d S%03d=E%06d\n",trn,pos,extent_nr); + lc_set(mdev->act_log,extent_nr,pos); + active_extents++; + } + spin_unlock_irq(&mdev->al_lock); + + transactions++; + + cancel: + if( i == to) break; + i++; + if( i > mx ) i=0; + } + + mdev->al_tr_number = to_tnr+1; + mdev->al_tr_pos = to; + if( ++mdev->al_tr_pos > div_ceil(mdev->act_log->nr_elements,AL_EXTENTS_PT) ) { + mdev->al_tr_pos=0; + } + + /* ok, we are done with it */ + up(&mdev->md_io_mutex); + + INFO("Found %d transactions (%d active extents) in activity log.\n", + transactions,active_extents); + + return 1; +} + +/** + * drbd_al_to_on_disk_bm: + * Writes the areas of the bitmap which are covered by the AL. + * called when we detach (unconfigure) local storage, + * or when we go from Primary to Secondary state. + */ +void drbd_al_to_on_disk_bm(struct Drbd_Conf *mdev) +{ + int i; + unsigned int enr; + + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + + i=inc_local_md_only(mdev); + D_ASSERT( i ); // Assertions should not have side effects. 
+ // I do not want to have D_ASSERT( inc_local_md_only(mdev) ); + + for(i=0;iact_log->nr_elements;i++) { + enr = lc_entry(mdev->act_log,i)->lc_number; + if(enr == LC_FREE) continue; + /* TODO encapsulate and optimize within drbd_bitmap + * currently, if we have al-extents 16..19 active, + * sector 4 will be written four times! */ + drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT ); + } + + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + dec_local(mdev); +} + +/** + * drbd_al_apply_to_bm: Sets the bits in the bitmap that are described + * by the active extents of the AL. + */ +void drbd_al_apply_to_bm(struct Drbd_Conf *mdev) +{ + unsigned int enr; + unsigned long add=0; + char ppb[10]; + int i; + + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + + for(i=0;iact_log->nr_elements;i++) { + enr = lc_entry(mdev->act_log,i)->lc_number; + if(enr == LC_FREE) continue; + add += drbd_bm_ALe_set_all(mdev, enr); + } + + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + + INFO("Marked additional %s as out-of-sync based on AL.\n", + ppsize(ppb,Bit2KB(add))); +} + +static inline int _try_lc_del(struct Drbd_Conf *mdev,struct lc_element *al_ext) +{ + int rv; + + spin_lock_irq(&mdev->al_lock); + rv = (al_ext->refcnt == 0); + if(likely(rv)) lc_del(mdev->act_log,al_ext); + spin_unlock_irq(&mdev->al_lock); + + if(unlikely(!rv)) INFO("Waiting for extent in drbd_al_shrink()\n"); + + return rv; +} + +/** + * drbd_al_shrink: Removes all active extents form the AL. (but does not + * write any transactions) + * You need to lock mdev->act_log with lc_try_lock() / lc_unlock() + */ +void drbd_al_shrink(struct Drbd_Conf *mdev) +{ + struct lc_element *al_ext; + int i; + + D_ASSERT( test_bit(__LC_DIRTY,&mdev->act_log->flags) ); + + for(i=0;iact_log->nr_elements;i++) { + al_ext = lc_entry(mdev->act_log,i); + if(al_ext->lc_number == LC_FREE) continue; + wait_event(mdev->al_wait, _try_lc_del(mdev,al_ext)); + } + + wake_up(&mdev->al_wait); +} + +STATIC int w_update_odbm(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + struct update_odbm_work *udw = (struct update_odbm_work*)w; + + if( !inc_local_md_only(mdev) ) { + if (DRBD_ratelimit(5*HZ,5)) + WARN("Can not update on disk bitmap, local IO disabled.\n"); + return 1; + } + + drbd_bm_write_sect(mdev, udw->enr ); + dec_local(mdev); + + kfree(udw); + + if(drbd_bm_total_weight(mdev) == 0 && + ( mdev->cstate == SyncSource || mdev->cstate == SyncTarget || + mdev->cstate == PausedSyncS || mdev->cstate == PausedSyncT ) ) { + D_ASSERT( mdev->resync_work.cb == w_resync_inactive ); + drbd_bm_lock(mdev); + drbd_resync_finished(mdev); + drbd_bm_unlock(mdev); + } + + return 1; +} + + +/* ATTENTION. The AL's extents are 4MB each, while the extents in the * + * resync LRU-cache are 16MB each. * + * + * TODO will be obsoleted once we have a caching lru of the on disk bitmap + */ +STATIC void drbd_try_clear_on_disk_bm(struct Drbd_Conf *mdev,sector_t sector, + int cleared) +{ + struct list_head *le, *tmp; + struct bm_extent* ext; + struct update_odbm_work * udw; + + unsigned int enr; + + MUST_HOLD(&mdev->al_lock); + + // I simply assume that a sector/size pair never crosses + // a 16 MB extent border. (Currently this is true...) + enr = BM_SECT_TO_EXT(sector); + + ext = (struct bm_extent *) lc_get(mdev->resync,enr); + if (ext) { + if( ext->lce.lc_number == enr) { + ext->rs_left -= cleared; + if (ext->rs_left < 0) { + ERR("BAD! sector=%lu enr=%u rs_left=%d cleared=%d\n", + (unsigned long)sector, + ext->lce.lc_number, ext->rs_left, cleared); + // FIXME brrrgs. 
should never happen! + _set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return; + } + } else { + //WARN("Recounting sectors in %d (resync LRU too small?)\n", enr); + // This element should be in the cache + // since drbd_rs_begin_io() pulled it already in. + int rs_left = drbd_bm_e_weight(mdev,enr); + if (ext->flags != 0) { + WARN("changing resync lce: %d[%u;%02lx]" + " -> %d[%u;00]\n", + ext->lce.lc_number, ext->rs_left, + ext->flags, enr, rs_left); + ext->flags = 0; + } + ext->rs_left = rs_left; + lc_changed(mdev->resync,&ext->lce); + } + lc_put(mdev->resync,&ext->lce); + // no race, we are within the al_lock! + } else { + ERR("lc_get() failed! locked=%d/%d flags=%lu\n", + atomic_read(&mdev->resync_locked), + mdev->resync->nr_elements, + mdev->resync->flags); + } + + list_for_each_safe(le,tmp,&mdev->resync->lru) { + ext=(struct bm_extent *)list_entry(le,struct lc_element,list); + if(ext->rs_left == 0) { + udw=kmalloc(sizeof(*udw),GFP_ATOMIC); + if(!udw) { + WARN("Could not kmalloc an udw\n"); + break; + } + udw->enr = ext->lce.lc_number; + udw->w.cb = w_update_odbm; + drbd_queue_work_front(mdev,&mdev->data.work,&udw->w); + if (ext->flags != 0) { + WARN("deleting resync lce: %d[%u;%02lx]\n", + ext->lce.lc_number, ext->rs_left, + ext->flags); + ext->flags = 0; + } + lc_del(mdev->resync,&ext->lce); + } + } +} + +/* clear the bit corresponding to the piece of storage in question: + * size byte of data starting from sector. Only clear a bits of the affected + * one ore more _aligned_ BM_BLOCK_SIZE blocks. + * + * called by worker on SyncTarget and receiver on SyncSource. + * + */ +void __drbd_set_in_sync(drbd_dev* mdev, sector_t sector, int size, const char* file, const unsigned int line) +{ + /* Is called from worker and receiver context _only_ */ + unsigned long sbnr,ebnr,lbnr,bnr; + unsigned long count = 0; + sector_t esector, nr_sectors; + int strange_state,wake_up=0; + + strange_state = (mdev->cstate <= Connected) || + test_bit(DISKLESS,&mdev->flags) || + test_bit(PARTNER_DISKLESS,&mdev->flags); + if (strange_state) { + ERR("%s:%d: %s flags=0x%02lx\n", file , line , + cstate_to_name(mdev->cstate), mdev->flags); + } + + if (size <= 0 || (size & 0x1ff) != 0 || size > PAGE_SIZE) { + ERR("drbd_set_in_sync: sector=%lu size=%d nonsense!\n", + (unsigned long)sector,size); + return; + } + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size>>9) -1; + + ERR_IF(sector >= nr_sectors) return; + ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + /* we clear it (in sync). + * round up start sector, round down end sector. we make sure we only + * clear full, alligned, BM_BLOCK_SIZE (4K) blocks */ + if (unlikely(esector < BM_SECT_PER_BIT-1)) { + return; + } else if (unlikely(esector == (nr_sectors-1))) { + ebnr = lbnr; + } else { + ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); + } + sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); + +#ifdef DUMP_EACH_PACKET + INFO("drbd_set_in_sync: sector=%lu size=%d sbnr=%lu ebnr=%lu\n", + (unsigned long)sector, size, sbnr, ebnr); +#endif + + if (sbnr > ebnr) return; + + /* + * ok, (capacity & 7) != 0 sometimes, but who cares... + * we count rs_{total,left} in bits, not sectors. 
+ */ + spin_lock_irq(&mdev->al_lock); + for(bnr=sbnr; bnr <= ebnr; bnr++) { + if (drbd_bm_clear_bit(mdev,bnr)) count++; + } + if (count) { + // we need the lock for drbd_try_clear_on_disk_bm + if(jiffies - mdev->rs_mark_time > HZ*10) { + /* should be roling marks, but we estimate only anyways. */ + if( mdev->rs_mark_left != drbd_bm_total_weight(mdev)) { + mdev->rs_mark_time =jiffies; + mdev->rs_mark_left =drbd_bm_total_weight(mdev); + } + } + drbd_try_clear_on_disk_bm(mdev,sector,count); + /* just wake_up unconditional now, + * various lc_chaged(), lc_put() in drbd_try_clear_on_disk_bm(). */ + wake_up=1; + } + spin_unlock_irq(&mdev->al_lock); + if(wake_up) wake_up(&mdev->al_wait); +} + +/* + * this is intended to set one request worth of data out of sync. + * affects at least 1 bit, and at most 1+PAGE_SIZE/BM_BLOCK_SIZE bits. + * + * called by tl_clear and drbd_send_dblock (==drbd_make_request). + * so this can be _any_ process. + */ +void __drbd_set_out_of_sync(drbd_dev* mdev, sector_t sector, int size, const char* file, const unsigned int line) +{ + unsigned long sbnr,ebnr,lbnr,bnr; + sector_t esector, nr_sectors; + int strange_state; + + strange_state = ( mdev->cstate > Connected ) || + ( mdev->cstate == Connected && + !(test_bit(DISKLESS,&mdev->flags) || + test_bit(PARTNER_DISKLESS,&mdev->flags)) ); + if (strange_state) { + ERR("%s:%d: %s flags=0x%02lx\n", file , line , + cstate_to_name(mdev->cstate), mdev->flags); + } + + if (size <= 0 || (size & 0x1ff) != 0 || size > PAGE_SIZE) { + ERR("sector: %lu, size: %d\n",(unsigned long)sector,size); + return; + } + + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size>>9) -1; + + ERR_IF(sector >= nr_sectors) return; + ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + /* we set it out of sync, + * we do not need to round anything here */ + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + + /* + * ok, (capacity & 7) != 0 sometimes, but who cares... + * we count rs_{total,left} in bits, not sectors. 
+ */ + for(bnr=sbnr; bnr <= ebnr; bnr++) drbd_bm_set_bit(mdev,bnr); +} + +static inline +struct bm_extent* _bme_get(struct Drbd_Conf *mdev, unsigned int enr) +{ + struct bm_extent *bm_ext; + int wakeup = 0; + unsigned long rs_flags; + + if(atomic_read(&mdev->resync_locked) > mdev->resync->nr_elements-3 ) { + //WARN("bme_get() does not lock all elements\n"); + return 0; + } + + spin_lock_irq(&mdev->al_lock); + bm_ext = (struct bm_extent*) lc_get(mdev->resync,enr); + if (bm_ext) { + if(bm_ext->lce.lc_number != enr) { + bm_ext->rs_left = drbd_bm_e_weight(mdev,enr); + lc_changed(mdev->resync,(struct lc_element*)bm_ext); + wakeup = 1; + } + if(bm_ext->lce.refcnt == 1) atomic_inc(&mdev->resync_locked); + set_bit(BME_NO_WRITES,&bm_ext->flags); // within the lock + } + rs_flags=mdev->resync->flags; + spin_unlock_irq(&mdev->al_lock); + if (wakeup) wake_up(&mdev->al_wait); + + if(!bm_ext) { + if (rs_flags & LC_STARVING) { + WARN("Have to wait for element" + " (resync LRU too small?)\n"); + } + if (rs_flags & LC_DIRTY) { + BUG(); // WARN("Ongoing RS update (???)\n"); + } + } + + return bm_ext; +} + +static inline int _is_in_al(drbd_dev* mdev, unsigned int enr) +{ + struct lc_element* al_ext; + int rv=0; + + spin_lock_irq(&mdev->al_lock); + if(unlikely(enr == mdev->act_log->new_number)) rv=1; + else { + al_ext = lc_find(mdev->act_log,enr); + if(al_ext) { + if (al_ext->refcnt) rv=1; + } + } + spin_unlock_irq(&mdev->al_lock); + + /* + if(unlikely(rv)) { + INFO("Delaying sync read until app's write is done\n"); + } + */ + return rv; +} + +/** + * drbd_rs_begin_io: Gets an extent in the resync LRU cache and sets it + * to BME_LOCKED. + * + * @sector: The sector number + */ +int drbd_rs_begin_io(drbd_dev* mdev, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + struct bm_extent* bm_ext; + int i, sig; + + sig = wait_event_interruptible( mdev->al_wait, + (bm_ext = _bme_get(mdev,enr)) ); + if (sig) return 0; + + if(test_bit(BME_LOCKED,&bm_ext->flags)) return 1; + + for(i=0;ial_wait, + !_is_in_al(mdev,enr*AL_EXT_PER_BM_SECT+i) ); + if (sig) { + spin_lock_irq(&mdev->al_lock); + if( lc_put(mdev->resync,&bm_ext->lce) == 0 ) { + clear_bit(BME_NO_WRITES,&bm_ext->flags); + atomic_dec(&mdev->resync_locked); + wake_up(&mdev->al_wait); + } + spin_unlock_irq(&mdev->al_lock); + return 0; + } + } + + set_bit(BME_LOCKED,&bm_ext->flags); + + return 1; +} + +void drbd_rs_complete_io(drbd_dev* mdev, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + struct bm_extent* bm_ext; + unsigned long flags; + + spin_lock_irqsave(&mdev->al_lock,flags); + bm_ext = (struct bm_extent*) lc_find(mdev->resync,enr); + if(!bm_ext) { + spin_unlock_irqrestore(&mdev->al_lock,flags); + ERR("drbd_rs_complete_io() called, but extent not found\n"); + return; + } + + if( lc_put(mdev->resync,(struct lc_element *)bm_ext) == 0 ) { + clear_bit(BME_LOCKED,&bm_ext->flags); + clear_bit(BME_NO_WRITES,&bm_ext->flags); + atomic_dec(&mdev->resync_locked); + wake_up(&mdev->al_wait); + } + + spin_unlock_irqrestore(&mdev->al_lock,flags); +} + +/** + * drbd_rs_cancel_all: Removes extents from the resync LRU. Even + * if they are BME_LOCKED. + */ +void drbd_rs_cancel_all(drbd_dev* mdev) +{ + struct bm_extent* bm_ext; + int i; + + spin_lock_irq(&mdev->al_lock); + + for(i=0;iresync->nr_elements;i++) { + bm_ext = (struct bm_extent*) lc_entry(mdev->resync,i); + if(bm_ext->lce.lc_number == LC_FREE) continue; + bm_ext->lce.refcnt = 0; // Rude but ok. 
+ bm_ext->rs_left = 0; + clear_bit(BME_LOCKED,&bm_ext->flags); + clear_bit(BME_NO_WRITES,&bm_ext->flags); + lc_del(mdev->resync,&bm_ext->lce); + } + atomic_set(&mdev->resync_locked,0); + spin_unlock_irq(&mdev->al_lock); + wake_up(&mdev->al_wait); +} --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./drivers/block/drbd/drbd_bitmap.c 2006-06-21 16:57:01.000000000 +0400 @@ -0,0 +1,980 @@ +/* +-*- linux-c -*- + drbd_bitmap.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 2004, Lars Ellenberg . + main author. + + Copyright (C) 2004, Philipp Reisner . + contributions. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include // for memset + +#include +#include "drbd_int.h" + +/* special handling for ppc64 on 2.4 kernel -- find_next_bit is not exported + * so we include it here (verbatim, from linux 2.4.21 sources) */ +#if defined(__powerpc64__) && LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + +unsigned long find_next_bit(unsigned long *addr, unsigned long size, unsigned long offset) +{ + unsigned long *p = addr + (offset >> 6); + unsigned long result = offset & ~63UL; + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset &= 63UL; + if (offset) { + tmp = *(p++); + tmp &= (~0UL << offset); + if (size < 64) + goto found_first; + if (tmp) + goto found_middle; + size -= 64; + result += 64; + } + while (size & ~63UL) { + if ((tmp = *(p++))) + goto found_middle; + result += 64; + size -= 64; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp &= (~0UL >> (64 - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __ffs(tmp); +} +#endif /* NEED_PPC64_WORKAROUND */ + +/* OPAQUE outside this file! + * interface defined in drbd_int.h + * + * unfortunately this currently means that this file is not + * yet selfcontained, because it needs to know about how to receive + * the bitmap from the peer via the data socket. + * This is to be solved with some sort of + * drbd_bm_copy(mdev,offset,size,unsigned long*) ... + + * Note that since find_first_bit returns int, this implementation + * "only" supports up to 1<<(32+12) == 16 TB... non issue, since + * currently DRBD is limited to ca 3.8 TB storage anyways. + * + * we will eventually change the implementation to not allways hold the full + * bitmap in memory, but only some 'lru_cache' of the on disk bitmap, + * since vmalloc'ing mostly unused 128M is antisocial. + + * THINK + * I'm not yet sure whether this file should be bits only, + * or wether I want it to do all the sector<->bit calculation in here. + */ + +/* + * NOTE + * Access to the *bm is protected by bm_lock. + * It is safe to read the other members within the lock. + * + * drbd_bm_set_bit is called from bio_endio callbacks, + * so there we need a spin_lock_irqsave. 
+ * Everywhere else we need a spin_lock_irq. + * + * FIXME + * Actually you need to serialize all resize operations. + * but then, resize is a drbd state change, and it should be serialized + * already. Unfortunately it is not (yet), so two concurrent resizes, like + * attach storage (drbdsetup) and receive the peers size (drbd receiver) + * may eventually blow things up. + * Therefore, + * you may only change the other members when holding + * the bm_change mutex _and_ the bm_lock. + * thus reading them holding either is safe. + * this is sort of overkill, but I rather do it right + * than have two resize operations interfere somewhen. + */ +struct drbd_bitmap { + unsigned long *bm; + spinlock_t bm_lock; + unsigned long bm_fo; // next offset for drbd_bm_find_next + unsigned long bm_set; // nr of set bits; THINK maybe atomic_t ? + unsigned long bm_bits; + size_t bm_words; + sector_t bm_dev_capacity; + struct semaphore bm_change; // serializes resize operations + + // { REMOVE + unsigned long bm_flags; // currently debugging aid only + unsigned long bm_line; + char *bm_file; + // } +}; + +// { REMOVE once we serialize all state changes properly +#define D_BUG_ON(x) ERR_IF(x) { dump_stack(); } +#define BM_LOCKED 0 +#if 0 // simply disabled for now... +#define MUST_NOT_BE_LOCKED() do { \ + if (test_bit(BM_LOCKED,&b->bm_flags)) { \ + if (DRBD_ratelimit(5*HZ,5)) { \ + ERR("%s:%d: bitmap is locked by %s:%lu\n", \ + __FILE__, __LINE__, b->bm_file,b->bm_line); \ + dump_stack(); \ + } \ + } \ +} while (0) +#define MUST_BE_LOCKED() do { \ + if (!test_bit(BM_LOCKED,&b->bm_flags)) { \ + if (DRBD_ratelimit(5*HZ,5)) { \ + ERR("%s:%d: bitmap not locked!\n", \ + __FILE__, __LINE__); \ + dump_stack(); \ + } \ + } \ +} while (0) +#else +#define MUST_NOT_BE_LOCKED() do {(void)b;} while (0) +#define MUST_BE_LOCKED() do {(void)b;} while (0) +#endif +void __drbd_bm_lock(drbd_dev *mdev, char* file, int line) +{ + struct drbd_bitmap *b = mdev->bitmap; + spin_lock_irq(&b->bm_lock); + if (!__test_and_set_bit(BM_LOCKED,&b->bm_flags)) { + b->bm_file = file; + b->bm_line = line; + } else if (DRBD_ratelimit(5*HZ,5)) { + ERR("%s:%d: bitmap already locked by %s:%lu\n", + file, line, b->bm_file,b->bm_line); + /* + dump_stack(); + ERR("This is no oops, but debug stack trace only.\n"); + ERR("If you get this often, or in reproducable situations, " + "notify \n"); + */ + } + spin_unlock_irq(&b->bm_lock); +} +void drbd_bm_unlock(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + spin_lock_irq(&b->bm_lock); + if (!__test_and_clear_bit(BM_LOCKED,&mdev->bitmap->bm_flags)) { + ERR("bitmap not locked in bm_unlock\n"); + } else { + /* FIXME if we got a "is already locked" previously, + * we unlock here even though we actually MUST NOT do so... */ + b->bm_file = NULL; + b->bm_line = -1; + } + spin_unlock_irq(&b->bm_lock); +} + +#if 0 +// has been very helpful to indicate that rs_total and rs_left have been +// used in a non-smp safe way... +#define BM_PARANOIA_CHECK() do { \ + D_ASSERT(b->bm[b->bm_words] == DRBD_MAGIC); \ + D_ASSERT(b->bm_dev_capacity == drbd_get_capacity(mdev->this_bdev)); \ + if ( (b->bm_set != mdev->rs_total) && \ + (b->bm_set != mdev->rs_left) ) { \ + if ( DRBD_ratelimit(5*HZ,5) ) { \ + ERR("%s:%d: ?? 
bm_set=%lu; rs_total=%lu, rs_left=%lu\n",\ + __FILE__ , __LINE__ , \ + b->bm_set, mdev->rs_total, mdev->rs_left ); \ + } \ + } \ +} while (0) +#else +#define BM_PARANOIA_CHECK() do { \ + D_ASSERT(b->bm[b->bm_words] == DRBD_MAGIC); \ + D_ASSERT(b->bm_dev_capacity == drbd_get_capacity(mdev->this_bdev)); \ +} while (0) +#endif +// } + +#if DUMP_MD >= 3 +/* debugging aid */ +STATIC void bm_end_info(drbd_dev *mdev, const char* where) +{ + struct drbd_bitmap *b = mdev->bitmap; + size_t w = (b->bm_bits-1) >> LN2_BPL; + + INFO("%s: bm_set=%lu\n", where, b->bm_set); + INFO("bm[%d]=0x%lX\n", w, b->bm[w]); + w++; + + if ( w < b->bm_words ) { + D_ASSERT(w == b->bm_words -1); + INFO("bm[%d]=0x%lX\n",w,b->bm[w]); + } +} +#else +#define bm_end_info(ignored...) ((void)(0)) +#endif + +/* long word offset of _bitmap_ sector */ +#define S2W(s) ((s)<<(BM_EXT_SIZE_B-BM_BLOCK_SIZE_B-LN2_BPL)) + +/* + * actually most functions herein should take a struct drbd_bitmap*, not a + * drbd_dev*, but for the debug macros I like to have the mdev around + * to be able to report device specific. + */ + +/* FIXME TODO sometimes I use "int offset" as index into the bitmap. + * since we currently are LIMITED to (128<<11)-64-8 sectors of bitmap, + * this is ok [as long as we dont run on a 24 bit arch :)]. + * But it is NOT strictly ok. + */ + +/* + * called on driver init only. TODO call when a device is created. + * allocates the drbd_bitmap, and stores it in mdev->bitmap. + */ +int drbd_bm_init(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + D_BUG_ON(b); + b = kmalloc(sizeof(struct drbd_bitmap),GFP_KERNEL); + if (!b) + return -ENOMEM; + memset(b,0,sizeof(*b)); + b->bm_lock = SPIN_LOCK_UNLOCKED; + init_MUTEX(&b->bm_change); + mdev->bitmap = b; + return 0; +} + +sector_t drbd_bm_capacity(drbd_dev *mdev) +{ + ERR_IF(!mdev->bitmap) return 0; + return mdev->bitmap->bm_dev_capacity; +} + +/* called on driver unload. TODO: call when a device is destroyed. + */ +void drbd_bm_cleanup(drbd_dev *mdev) +{ + ERR_IF (!mdev->bitmap) return; + /* FIXME I think we should explicitly change the device size to zero + * before this... + * + D_BUG_ON(mdev->bitmap->bm); + */ + vfree(mdev->bitmap->bm); + kfree(mdev->bitmap); + mdev->bitmap = NULL; +} + +/* + * since (b->bm_bits % BITS_PER_LONG) != 0, + * this masks out the remaining bits. + * Rerturns the number of bits cleared. + */ +STATIC int bm_clear_surplus(struct drbd_bitmap * b) +{ + const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) -1; + size_t w = b->bm_bits >> LN2_BPL; + int cleared=0; + + if ( w < b->bm_words ) { + cleared = hweight_long(b->bm[w] & ~mask); + b->bm[w++] &= mask; + } + + if ( w < b->bm_words ) { + cleared += hweight_long(b->bm[w]); + b->bm[w++]=0; + } + + return cleared; +} + +STATIC void bm_set_surplus(struct drbd_bitmap * b) +{ + const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) -1; + size_t w = b->bm_bits >> LN2_BPL; + + if ( w < b->bm_words ) { + b->bm[w++] |= ~mask; + } + + if ( w < b->bm_words ) { + b->bm[w++] = ~(0UL); + } +} + +STATIC unsigned long bm_count_bits(struct drbd_bitmap * b) +{ + unsigned long *bm = b->bm; + unsigned long *ep = b->bm + b->bm_words; + unsigned long bits = 0; + + while ( bm < ep ) { + bits += hweight_long(*bm++); + } + + return bits; +} + +#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512) + +/* + * make sure the bitmap has enough room for the attached storage, + * if neccessary, resize. + * called whenever we may have changed the device size. 
+ * returns -ENOMEM if we could not allocate enough memory, 0 on success. + * In case this is actually a resize, we copy the old bitmap into the new one. + * Otherwise, the bitmap is initiallized to all bits set. + */ +int drbd_bm_resize(drbd_dev *mdev, sector_t capacity) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long bits, bytes, words, *nbm, *obm = 0; + int err = 0, growing; + + ERR_IF(!b) return -ENOMEM; + MUST_BE_LOCKED(); + + ERR_IF (down_trylock(&b->bm_change)) { + down(&b->bm_change); + } + + if (capacity == b->bm_dev_capacity) + goto out; + + if (capacity == 0) { + spin_lock_irq(&b->bm_lock); + obm = b->bm; + b->bm = NULL; + b->bm_fo = + b->bm_set = + b->bm_bits = + b->bm_words = + b->bm_dev_capacity = 0; + spin_unlock_irq(&b->bm_lock); + goto free_obm; + } else { + bits = ALIGN(capacity,BM_SECTORS_PER_BIT) + >> (BM_BLOCK_SIZE_B-9); + + /* if we would use + words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL; + a 32bit host could present the wrong number of words + to a 64bit host. + */ + words = ALIGN(bits,64) >> LN2_BPL; + + D_ASSERT(bits < ((MD_RESERVED_SIZE<<1)-MD_BM_OFFSET)<<12 ); + + if ( words == b->bm_words ) { + /* optimize: capacity has changed, + * but only within one long word worth of bits. + * just update the bm_dev_capacity and bm_bits members. + */ + spin_lock_irq(&b->bm_lock); + b->bm_bits = bits; + b->bm_dev_capacity = capacity; + b->bm_set -= bm_clear_surplus(b); + bm_end_info(mdev, __FUNCTION__ ); + spin_unlock_irq(&b->bm_lock); + goto out; + } else { + /* one extra long to catch off by one errors */ + bytes = (words+1)*sizeof(long); + nbm = vmalloc(bytes); + if (!nbm) { + err = -ENOMEM; + goto out; + } + } + spin_lock_irq(&b->bm_lock); + obm = b->bm; + // brgs. move several MB within spinlock... + if (obm) { + bm_set_surplus(b); + D_ASSERT(b->bm[b->bm_words] == DRBD_MAGIC); + memcpy(nbm,obm,min_t(size_t,b->bm_words,words)*sizeof(long)); + } + growing = words > b->bm_words; + if (growing) { // set all newly allocated bits + memset( nbm+b->bm_words, -1, + (words - b->bm_words) * sizeof(long) ); + b->bm_set += bits - b->bm_bits; + } + nbm[words] = DRBD_MAGIC; + b->bm = nbm; + b->bm_bits = bits; + b->bm_words = words; + b->bm_dev_capacity = capacity; + bm_clear_surplus(b); + if( !growing ) b->bm_set = bm_count_bits(b); + bm_end_info(mdev, __FUNCTION__ ); + spin_unlock_irq(&b->bm_lock); + INFO("resync bitmap: bits=%lu words=%lu\n",bits,words); + } + free_obm: + vfree(obm); // vfree(NULL) is noop + out: + up(&b->bm_change); + return err; +} + +/* inherently racy: + * if not protected by other means, return value may be out of date when + * leaving this function... + * we still need to lock it, since it is important that this returns + * bm_set == 0 precisely. + * + * maybe bm_set should be atomic_t ? + */ +unsigned long drbd_bm_total_weight(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long s; + unsigned long flags; + + ERR_IF(!b) return 0; + // MUST_BE_LOCKED(); well. yes. but ... + + spin_lock_irqsave(&b->bm_lock,flags); + s = b->bm_set; + spin_unlock_irqrestore(&b->bm_lock,flags); + + return s; +} + +size_t drbd_bm_words(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return 0; + + /* FIXME + * actually yes. really. otherwise it could just change its size ... + * but it triggers all the time... + * MUST_BE_LOCKED(); + */ + + return b->bm_words; +} + +/* merge number words from buffer into the bitmap starting at offset. + * buffer[i] is expected to be little endian unsigned long. 
+ */ +void drbd_bm_merge_lel( drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer ) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *bm; + unsigned long word, bits; + size_t n = number; + + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + D_BUG_ON(offset >= b->bm_words); + D_BUG_ON(offset+number > b->bm_words); + D_BUG_ON(number > PAGE_SIZE/sizeof(long)); + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + // BM_PARANOIA_CHECK(); no. + bm = b->bm + offset; + while(n--) { + bits = hweight_long(*bm); + word = *bm | lel_to_cpu(*buffer++); + *bm++ = word; + b->bm_set += hweight_long(word) - bits; + } + /* with 32bit <-> 64bit cross-platform connect + * this is only correct for current usage, + * where we _know_ that we are 64 bit aligned, + * and know that this function is used in this way, too... + */ + if (offset+number == b->bm_words) { + b->bm_set -= bm_clear_surplus(b); + bm_end_info(mdev, __FUNCTION__ ); + } + spin_unlock_irq(&b->bm_lock); +} + +/* copy number words from buffer into the bitmap starting at offset. + * buffer[i] is expected to be little endian unsigned long. + */ +void drbd_bm_set_lel( drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer ) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *bm; + unsigned long word, bits; + size_t n = number; + + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + D_BUG_ON(offset >= b->bm_words); + D_BUG_ON(offset+number > b->bm_words); + D_BUG_ON(number > PAGE_SIZE/sizeof(long)); + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + // BM_PARANOIA_CHECK(); no. + bm = b->bm + offset; + while(n--) { + bits = hweight_long(*bm); + word = lel_to_cpu(*buffer++); + *bm++ = word; + b->bm_set += hweight_long(word) - bits; + } + /* with 32bit <-> 64bit cross-platform connect + * this is only correct for current usage, + * where we _know_ that we are 64 bit aligned, + * and know that this function is used in this way, too... + */ + if (offset+number == b->bm_words) { + b->bm_set -= bm_clear_surplus(b); + bm_end_info(mdev, __FUNCTION__ ); + } + spin_unlock_irq(&b->bm_lock); +} + +/* copy number words from the bitmap starting at offset into the buffer. + * buffer[i] will be little endian unsigned long. + */ +void drbd_bm_get_lel( drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer ) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *bm; + + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + if ( (offset >= b->bm_words) || + (offset+number > b->bm_words) || + (number > PAGE_SIZE/sizeof(long)) || + (number <= 0) ) { + // yes, there is "%z", but that gives compiler warnings... + ERR("offset=%lu number=%lu bm_words=%lu\n", + (unsigned long) offset, + (unsigned long) number, + (unsigned long) b->bm_words); + return; + } + + // MUST_BE_LOCKED(); yes. but not neccessarily globally... + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + bm = b->bm + offset; + while(number--) *buffer++ = cpu_to_lel(*bm++); + spin_unlock_irq(&b->bm_lock); +} + +/* set all bits in the bitmap */ +void drbd_bm_set_all(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + memset(b->bm,-1,b->bm_words*sizeof(long)); + bm_clear_surplus(b); + b->bm_set = b->bm_bits; + spin_unlock_irq(&b->bm_lock); +} + +/* read one sector of the on disk bitmap into memory. + * on disk bitmap is little endian. + * @enr is _sector_ offset from start of on disk bitmap (aka bm-extent nr). 
+ * returns 0 on success, -EIO on failure + */ +int drbd_bm_read_sect(drbd_dev *mdev,unsigned long enr) +{ + sector_t on_disk_sector = enr + drbd_md_ss(mdev) + MD_BM_OFFSET; + int bm_words, num_words, offset, err = 0; + + // MUST_BE_LOCKED(); not neccessarily global ... + + down(&mdev->md_io_mutex); + if(drbd_md_sync_page_io(mdev,on_disk_sector,READ)) { + bm_words = drbd_bm_words(mdev); + offset = S2W(enr); // word offset into bitmap + num_words = min(S2W(1), bm_words - offset); +#if DUMP_MD >= 3 + INFO("read_sect: sector=%lu offset=%u num_words=%u\n", + enr, offset, num_words); +#endif + drbd_bm_set_lel( mdev, offset, num_words, + page_address(mdev->md_io_page) ); + } else { + int i; + err = -EIO; + ERR( "IO ERROR reading bitmap sector %lu " + "(meta-disk sector %lu)\n", + enr, (unsigned long)on_disk_sector ); + drbd_chk_io_error(mdev, 1); + drbd_io_error(mdev); + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) + drbd_bm_ALe_set_all(mdev,enr*AL_EXT_PER_BM_SECT+i); + } + up(&mdev->md_io_mutex); + return err; +} + +/** + * drbd_bm_read: Read the whole bitmap from its on disk location. + */ +void drbd_bm_read(struct Drbd_Conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + sector_t sector; + int bm_words, num_sectors; + char ppb[10]; + + MUST_BE_LOCKED(); + + bm_words = drbd_bm_words(mdev); + num_sectors = (bm_words*sizeof(long) + 511) >> 9; + + for (sector = 0; sector < num_sectors; sector++) { + // FIXME do something on io error here? + drbd_bm_read_sect(mdev,sector); + } + + INFO("%s marked out-of-sync by on disk bit-map.\n", + ppsize(ppb,drbd_bm_total_weight(mdev) << (BM_BLOCK_SIZE_B-10)) ); +} + +/** + * drbd_bm_write_sect: Writes a 512 byte piece of the bitmap to its + * on disk location. On disk bitmap is little endian. + * + * @enr: The _sector_ offset from the start of the bitmap. + * + */ +int drbd_bm_write_sect(struct Drbd_Conf *mdev,unsigned long enr) +{ + sector_t on_disk_sector = enr + drbd_md_ss(mdev) + MD_BM_OFFSET; + int bm_words, num_words, offset, err = 0; + + // MUST_BE_LOCKED(); not neccessarily global... + + down(&mdev->md_io_mutex); + bm_words = drbd_bm_words(mdev); + offset = S2W(enr); // word offset into bitmap + num_words = min(S2W(1), bm_words - offset); +#if DUMP_MD >= 3 + INFO("write_sect: sector=%lu offset=%u num_words=%u\n", + enr, offset, num_words); +#endif + if (num_words < S2W(1)) { + memset(page_address(mdev->md_io_page),0,MD_HARDSECT); + } + drbd_bm_get_lel( mdev, offset, num_words, + page_address(mdev->md_io_page) ); + if (!drbd_md_sync_page_io(mdev,on_disk_sector,WRITE)) { + int i; + err = -EIO; + ERR( "IO ERROR writing bitmap sector %lu " + "(meta-disk sector %lu)\n", + enr, (unsigned long)on_disk_sector ); + drbd_chk_io_error(mdev, 1); + drbd_io_error(mdev); + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) + drbd_bm_ALe_set_all(mdev,enr*AL_EXT_PER_BM_SECT+i); + } + mdev->bm_writ_cnt++; + up(&mdev->md_io_mutex); + return err; +} + +/** + * drbd_bm_write: Write the whole bitmap to its on disk location. + */ +void drbd_bm_write(struct Drbd_Conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + sector_t sector; + int bm_words, num_sectors; + + MUST_BE_LOCKED(); + + bm_words = drbd_bm_words(mdev); + num_sectors = (bm_words*sizeof(long) + 511) >> 9; + + for (sector = 0; sector < num_sectors; sector++) { + // FIXME do something on io error here? 
+ drbd_bm_write_sect(mdev,sector); + } + + INFO("%lu KB now marked out-of-sync by on disk bit-map.\n", + drbd_bm_total_weight(mdev) << (BM_BLOCK_SIZE_B-10) ); +} + +/* clear all bits in the bitmap */ +void drbd_bm_clear_all(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return; + ERR_IF(!b->bm) return; + + MUST_BE_LOCKED(); \ + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + memset(b->bm,0,b->bm_words*sizeof(long)); + b->bm_set = 0; + spin_unlock_irq(&b->bm_lock); +} + +void drbd_bm_reset_find(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return; + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + b->bm_fo = 0; + spin_unlock_irq(&b->bm_lock); + +} + +/* NOTE + * find_first_bit returns int, we return unsigned long. + * should not make much difference anyways, but ... + * this returns a bit number, NOT a sector! + */ +unsigned long drbd_bm_find_next(drbd_dev *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long i = -1UL; + ERR_IF(!b) return i; + ERR_IF(!b->bm) return i; + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + if (b->bm_fo < b->bm_bits) { + i = find_next_bit(b->bm,b->bm_bits,b->bm_fo); + } else if (b->bm_fo > b->bm_bits) { + ERR("bm_fo=%lu bm_bits=%lu\n",b->bm_fo, b->bm_bits); + } + if (i >= b->bm_bits) { + i = -1UL; + b->bm_fo = 0; + } else { + b->bm_fo = i+1; + } + spin_unlock_irq(&b->bm_lock); + return i; +} + +int drbd_bm_rs_done(drbd_dev *mdev) +{ + return mdev->bitmap->bm_fo == 0; +} + +// THINK maybe the D_BUG_ON(i<0)s in set/clear/test should be not that strict? + +/* returns previous bit state + * wants bitnr, NOT sector. + */ +int drbd_bm_set_bit(drbd_dev *mdev, const unsigned long bitnr) +{ + struct drbd_bitmap *b = mdev->bitmap; + int i; + ERR_IF(!b) return 1; + ERR_IF(!b->bm) return 1; + +/* + * only called from drbd_set_out_of_sync. + * strange_state blubber is already in place there... + strange_state = ( mdev->cstate > Connected ) || + ( mdev->cstate == Connected && + !(test_bit(DISKLESS,&mdev->flags) || + test_bit(PARTNER_DISKLESS,&mdev->flags)) ); + if (strange_state) + ERR("%s in drbd_bm_set_bit\n", cstate_to_name(mdev->cstate)); +*/ + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + MUST_NOT_BE_LOCKED(); + ERR_IF (bitnr >= b->bm_bits) { + ERR("bitnr=%lu bm_bits=%lu\n",bitnr, b->bm_bits); + i = 0; + } else { + i = (0 != __test_and_set_bit(bitnr, b->bm)); + b->bm_set += !i; + } + spin_unlock_irq(&b->bm_lock); + return i; +} + +/* returns previous bit state + * wants bitnr, NOT sector. + */ +int drbd_bm_clear_bit(drbd_dev *mdev, const unsigned long bitnr) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long flags; + int i; + ERR_IF(!b) return 0; + ERR_IF(!b->bm) return 0; + + spin_lock_irqsave(&b->bm_lock,flags); + BM_PARANOIA_CHECK(); + MUST_NOT_BE_LOCKED(); + ERR_IF (bitnr >= b->bm_bits) { + ERR("bitnr=%lu bm_bits=%lu\n",bitnr, b->bm_bits); + i = 0; + } else { + i = (0 != __test_and_clear_bit(bitnr, b->bm)); + b->bm_set -= i; + } + spin_unlock_irqrestore(&b->bm_lock,flags); + + /* clearing bits should only take place when sync is in progress! + * this is only called from drbd_set_in_sync. + * strange_state blubber is already in place there ... + if (i && mdev->cstate <= Connected) + ERR("drbd_bm_clear_bit: cleared a bitnr=%lu while %s\n", + bitnr, cstate_to_name(mdev->cstate)); + */ + + return i; +} + +/* returns bit state + * wants bitnr, NOT sector. + * inherently racy... 
area needs to be locked by means of {al,rs}_lru + */ +int drbd_bm_test_bit(drbd_dev *mdev, const unsigned long bitnr) +{ + struct drbd_bitmap *b = mdev->bitmap; + int i; + ERR_IF(!b) return 0; + ERR_IF(!b->bm) return 0; + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + ERR_IF (bitnr >= b->bm_bits) { + ERR("bitnr=%lu bm_bits=%lu\n",bitnr, b->bm_bits); + i = 0; + } else { + i = test_bit(bitnr, b->bm); + } + spin_unlock_irq(&b->bm_lock); + return i; +} + +/* inherently racy... + * return value may be already out-of-date when this function returns. + * but the general usage is that this is only use during a cstate when bits are + * only cleared, not set, and typically only care for the case when the return + * value is zero, or we already "locked" this "bitmap extent" by other means. + * + * enr is bm-extent number, since we chose to name one sector (512 bytes) + * worth of the bitmap a "bitmap extent". + * + * TODO + * I think since we use it like a reference count, we should use the real + * reference count of some bitmap extent element from some lru instead... + * + */ +int drbd_bm_e_weight(drbd_dev *mdev, unsigned long enr) +{ + struct drbd_bitmap *b = mdev->bitmap; + int count, s, e; + unsigned long flags; + + ERR_IF(!b) return 0; + ERR_IF(!b->bm) return 0; + spin_lock_irqsave(&b->bm_lock,flags); + BM_PARANOIA_CHECK(); + + s = S2W(enr); + e = min((size_t)S2W(enr+1),b->bm_words); + count = 0; + if (s < b->bm_words) { + const unsigned long* w = b->bm+s; + int n = e-s; + while (n--) count += hweight_long(*w++); + } else { + ERR("start offset (%d) too large in drbd_bm_e_weight\n", s); + } + spin_unlock_irqrestore(&b->bm_lock,flags); +#if DUMP_MD >= 3 + INFO("enr=%lu weight=%d e=%d s=%d\n", enr, count, e, s); +#endif + return count; +} + +/* set all bits covered by the AL-extent al_enr */ +unsigned long drbd_bm_ALe_set_all(drbd_dev *mdev, unsigned long al_enr) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long weight; + int count, s, e; + ERR_IF(!b) return 0; + ERR_IF(!b->bm) return 0; + + MUST_BE_LOCKED(); + + spin_lock_irq(&b->bm_lock); + BM_PARANOIA_CHECK(); + weight = b->bm_set; + + s = al_enr * BM_WORDS_PER_AL_EXT; + e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words); + count = 0; + if (s < b->bm_words) { + const unsigned long* w = b->bm+s; + int n = e-s; + while (n--) count += hweight_long(*w++); + n = e-s; + memset(b->bm+s,-1,n*sizeof(long)); + b->bm_set += n*BITS_PER_LONG - count; + if (e == b->bm_words) { + b->bm_set -= bm_clear_surplus(b); + } + } else { + ERR("start offset (%d) too large in drbd_bm_ALe_set_all\n", s); + } + weight = b->bm_set - weight; + spin_unlock_irq(&b->bm_lock); + return weight; +} --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./drivers/block/drbd/drbd_buildtag.c 2006-06-21 17:15:45.000000000 +0400 @@ -0,0 +1,6 @@ +/* automatically generated. DO NOT EDIT. */ +const char * drbd_buildtag(void) +{ + return "SVN Revision: 2093" + " build by phil@mescal, 2006-03-06 15:04:12"; +} --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./drivers/block/drbd/drbd_compat_types.h 2006-06-21 16:57:01.000000000 +0400 @@ -0,0 +1,324 @@ + +// currently only abstraction layer to get all references to buffer_head +// and b_some_thing out of our .c files. + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +#include + +typedef struct buffer_head drbd_bio_t; +typedef unsigned long sector_t; + +#define NOT_IN_26(x...) x +#define ONLY_IN_26(x...) 
+ +#if !defined(CONFIG_HIGHMEM) && !defined(bh_kmap) +#define bh_kmap(bh) ((bh)->b_data) +#define bh_kunmap(bh) do { } while (0) +#endif + +#ifndef list_for_each +#define list_for_each(pos, head) \ + for(pos = (head)->next; pos != (head); pos = pos->next) +#endif + +// RH 2.4.9 does not have min() / max() +#ifndef min +# define min(x,y) \ + ({ typeof(x) __x = (x); typeof(y) __y = (y); \ + (void)(&__x == &__y); \ + __x < __y ? __x: __y; }) +#endif + +#ifndef max +# define max(x,y) \ + ({ typeof(x) __x = (x); typeof(y) __y = (y); \ + (void)(&__x == &__y); \ + __x > __y ? __x: __y; }) +#endif + +#ifndef MODULE_LICENSE +# define MODULE_LICENSE(L) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,10) +#define min_t(type,x,y) \ + ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; }) +#define max_t(type,x,y) \ + ({ type __x = (x); type __y = (y); __x > __y ? __x: __y; }) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,7) +#define completion semaphore +#define init_completion(A) init_MUTEX_LOCKED(A) +#define wait_for_completion(A) down(A) +#define complete(A) up(A) +#else +#include +#endif + +/* note that if you use some verndor kernels like SuSE, + * their 2.4.X variant probably already contain equivalent definitions. + * you then have to disable this compat again... + */ + +#ifndef HAVE_FIND_NEXT_BIT /* { */ + +#if defined(__i386__) || defined(__arch_um__) +/** + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit-number of the first set bit, not the number of the byte + * containing a bit. + */ +static __inline__ int find_first_bit(const unsigned long *addr, unsigned size) +{ + int d0, d1; + int res; + + /* This looks at memory. Mark it volatile to tell gcc not to move it around */ + __asm__ __volatile__( + "xorl %%eax,%%eax\n\t" + "repe; scasl\n\t" + "jz 1f\n\t" + "leal -4(%%edi),%%edi\n\t" + "bsfl (%%edi),%%eax\n" + "1:\tsubl %%ebx,%%edi\n\t" + "shll $3,%%edi\n\t" + "addl %%edi,%%eax" + :"=a" (res), "=&c" (d0), "=&D" (d1) + :"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory"); + return res; +} + +/** + * find_next_bit - find the first set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ + +static __inline__ int find_next_bit(const unsigned long *addr, int size, int offset) +{ + const unsigned long *p = addr + (offset >> 5); + int set = 0, bit = offset & 31, res; + + if (bit) { + /* + * Look for nonzero in the first 32 bits: + */ + __asm__("bsfl %1,%0\n\t" + "jne 1f\n\t" + "movl $32, %0\n" + "1:" + : "=r" (set) + : "r" (*p >> bit)); + if (set < (32 - bit)) + return set + offset; + set = 32 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = find_first_bit (p, size - 32 * (p - addr)); + return (offset + set + res); +} + +#elif defined(__x86_64__) + +static __inline__ int find_first_bit(const unsigned long * addr, unsigned size) +{ + int d0, d1; + int res; + + /* This looks at memory. 
Mark it volatile to tell gcc not to move it around */ + __asm__ __volatile__( + "xorl %%eax,%%eax\n\t" + "repe; scasl\n\t" + "jz 1f\n\t" + "leaq -4(%%rdi),%%rdi\n\t" + "bsfl (%%rdi),%%eax\n" + "1:\tsubq %%rbx,%%rdi\n\t" + "shll $3,%%edi\n\t" + "addl %%edi,%%eax" + :"=a" (res), "=&c" (d0), "=&D" (d1) + :"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory"); + return res; +} + +static __inline__ int find_next_bit(const unsigned long * addr, int size, int offset) +{ + const unsigned long * p = addr + (offset >> 6); + unsigned long set = 0, bit = offset & 63, res; + + if (bit) { + /* + * Look for nonzero in the first 64 bits: + */ + __asm__("bsfq %1,%0\n\t" + "cmoveq %2,%0\n\t" + : "=r" (set) + : "r" (*p >> bit), "r" (64L)); + if (set < (64 - bit)) + return set + offset; + set = 64 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = find_first_bit (p, size - 64 * (p - addr)); + return (offset + set + res); +} + +#elif defined(__alpha__) + +#include +#if __GNUC__ == 3 && __GNUC_MINOR__ >= 4 || __GNUC__ > 3 +# define __kernel_cmpbge(a, b) __builtin_alpha_cmpbge(a, b) +#else +# define __kernel_cmpbge(a, b) \ + ({ unsigned long __kir; \ + __asm__("cmpbge %r2,%1,%0" : "=r"(__kir) : "rI"(b), "rJ"(a)); \ + __kir; }) +#endif + +static inline unsigned long __ffs(unsigned long word) +{ +#if defined(__alpha_cix__) && defined(__alpha_fix__) + /* Whee. EV67 can calculate it directly. */ + return __kernel_cttz(word); +#else + unsigned long bits, qofs, bofs; + + bits = __kernel_cmpbge(0, word); + qofs = ffz_b(bits); + bits = __kernel_extbl(word, qofs); + bofs = ffz_b(~bits); + + return qofs*8 + bofs; +#endif +} + +static inline unsigned long +find_next_bit(void * addr, unsigned long size, unsigned long offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> 6); + unsigned long result = offset & ~63UL; + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset &= 63UL; + if (offset) { + tmp = *(p++); + tmp &= ~0UL << offset; + if (size < 64) + goto found_first; + if (tmp) + goto found_middle; + size -= 64; + result += 64; + } + while (size & ~63UL) { + if ((tmp = *(p++))) + goto found_middle; + result += 64; + size -= 64; + } + if (!size) + return result; + tmp = *p; + found_first: + tmp &= ~0UL >> (64 - size); + if (!tmp) + return result + size; + found_middle: + return result + __ffs(tmp); +} +#elif defined(USE_GENERIC_FIND_NEXT_BIT) + +#if BITS_PER_LONG == 32 +#define _xFFFF 31ul +#define _x10000 32 +#define _xSHIFT 5 +#elif BITS_PER_LONG == 64 +#define _xFFFF 63ul +#define _x10000 64 +#define _xSHIFT 6 +#else +#error "Unexpected BITS_PER_LONG" +#endif + +/* slightly large to be inlined, but anyways... 
*/ +static inline unsigned long +find_next_bit(void * addr, unsigned long size, unsigned long offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> _xSHIFT); + unsigned long result = offset & ~_xFFFF; + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset &= _xFFFF; + if (offset) { + tmp = *(p++); + tmp &= ~0UL << offset; + if (size < _x10000) + goto found_first; + if (tmp) + goto found_middle; + size -= _x10000; + result += _x10000; + } + while (size & ~_xFFFF) { + if ((tmp = *(p++))) + goto found_middle; + result += _x10000; + size -= _x10000; + } + if (!size) + return result; + tmp = *p; + found_first: + tmp &= ~0UL >> (_x10000 - size); + if (!tmp) + return result + size; + found_middle: /* if this is reached, we know that (tmp != 0) */ + return result + generic_ffs(tmp)-1; +} + +#undef _xFFFF +#undef _x10000 +#undef _xSHIFT + +#elif !defined(__powerpc64__) /* ppc64 is taken care of, see drbd_bitmap.c */ +#warning "You probably need to copy find_next_bit() from a 2.6.x kernel." +#warning "Or enable low performance generic C-code" +#warning "(USE_GENERIC_FIND_NEXT_BIT in drbd_config.h)" +#endif + +#endif /* HAVE_FIND_NEXT_BIT } */ + +#ifndef ALIGN +#define ALIGN(x,a) ( ((x) + (a)-1) &~ ((a)-1) ) +#endif + +#ifndef BUG_ON +#define BUG_ON(condition) do { if (unlikely((condition)!=0)) BUG(); } while(0) +#endif + +#else // LINUX 2.6 + +typedef struct bio drbd_bio_t; + +#define SIGHAND_HACK + +#define NOT_IN_26(x...) +#define ONLY_IN_26(x...) x + +#endif --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./drivers/block/drbd/drbd_compat_wrappers.h 2006-06-21 16:57:01.000000000 +0400 @@ -0,0 +1,653 @@ +// currently only abstraction layer to get all references to buffer_head +// and b_some_thing out of our .c files. + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + +#define __module_get __MOD_INC_USE_COUNT +#define module_put __MOD_DEC_USE_COUNT + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20) +/* + * dump_stack() showed up in 2.4.20. + * show_stack is arch-specific + * The architecture-independent backtrace generator + */ +static inline void dump_stack(void) +{ +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,18) + // It seems that before 2.4.18 even show_stack is not available. + show_stack(0); +#endif +} +#endif + +// b_end_io handlers +extern void drbd_md_io_complete (struct buffer_head *bh, int uptodate); +extern void enslaved_read_bi_end_io (struct buffer_head *bh, int uptodate); +extern void drbd_dio_end_sec (struct buffer_head *bh, int uptodate); +extern void drbd_dio_end (struct buffer_head *bh, int uptodate); +extern void drbd_read_bi_end_io (struct buffer_head *bh, int uptodate); + +/* + * because in 2.6.x [sg]et_capacity operate on gendisk->capacity, which is in + * units of 512 bytes sectors, these wrappers have a <<1 or >>1 where + * appropriate. + */ + +static inline sector_t drbd_get_hardsect(kdev_t dev) +{ + return hardsect_size[MAJOR(dev)] ? + hardsect_size[MAJOR(dev)][MINOR(dev)] : 512; +} + +/* Returns the number of 512 byte sectors of the device */ +static inline sector_t drbd_get_capacity(kdev_t dev) +{ + return dev ? blk_size[MAJOR(dev)][MINOR(dev)]<<1 : 0; +} + +/* sets the number of 512 byte sectors of our virtual device */ +static inline void drbd_set_my_capacity(drbd_dev *mdev, sector_t size) +{ + blk_size[MAJOR_NR][(int)(mdev - drbd_conf)] = (size>>1); +} + +//#warning "FIXME why don't we care for the return value?" 
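The <<1 and >>1 in the capacity wrappers above are the 1 KiB to 512-byte-sector conversion: 2.4 keeps device sizes in blk_size[][] in units of 1 KiB, while the rest of DRBD (like 2.6's [sg]et_capacity) counts 512-byte sectors. A self-contained sketch of that arithmetic (the example_* names are illustrative, not part of the driver):

    /* sketch: 1 KiB blocks <-> 512-byte sectors */
    static inline sector_t example_kb_to_sectors(unsigned long kb)
    {
            return (sector_t)kb << 1;        /* 1 KiB == 2 sectors */
    }
    static inline unsigned long example_sectors_to_kb(sector_t sect)
    {
            return (unsigned long)(sect >> 1);
    }
    /* e.g. a 4 GiB lower-level device: 4194304 KiB == 8388608 sectors */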
+static inline void drbd_set_blocksize(drbd_dev *mdev, int blksize) +{ + set_blocksize(mdev->this_bdev, blksize); + if (mdev->backing_bdev) + set_blocksize(mdev->backing_bdev, blksize); + else D_ASSERT(mdev->backing_bdev); +} + +static inline int drbd_sync_me(drbd_dev *mdev) +{ + return fsync_dev(mdev->this_bdev); +} + +#define drbd_bio_uptodate(bio) buffer_uptodate(bio) + +static inline void drbd_bio_IO_error(struct buffer_head *bh) +{ + buffer_IO_error(bh); +} + +static inline void drbd_bio_endio(struct buffer_head *bh, int uptodate) +{ + bh->b_end_io(bh,uptodate); +} + +static inline drbd_dev* drbd_req_get_mdev(struct drbd_request *req) +{ + return (drbd_dev*) req->private_bio.b_private; +} + +static inline sector_t drbd_req_get_sector(struct drbd_request *req) +{ + return req->private_bio.b_blocknr; +} + +static inline unsigned short drbd_req_get_size(struct drbd_request *req) +{ + return req->private_bio.b_size; +} + +static inline drbd_bio_t* drbd_req_private_bio(struct drbd_request *req) +{ + return &req->private_bio; +} + +static inline sector_t drbd_ee_get_sector(struct Tl_epoch_entry *ee) +{ + return ee->private_bio.b_blocknr; +} + +static inline unsigned short drbd_ee_get_size(struct Tl_epoch_entry *ee) +{ + return ee->private_bio.b_size; +} + +static inline char *drbd_bio_kmap(struct buffer_head *bh) +{ + return bh_kmap(bh); +} + +static inline void drbd_bio_kunmap(struct buffer_head *bh) +{ + bh_kunmap(bh); +} + +static inline void drbd_ee_init(struct Tl_epoch_entry *e,struct page *page) +{ + struct buffer_head * const bh = &e->private_bio; + memset(e, 0, sizeof(*e)); + + // bh->b_list = BUF_LOCKED; // does it matter? + bh->b_size = PAGE_SIZE; + bh->b_this_page = bh; + bh->b_state = (1 << BH_Mapped); + init_waitqueue_head(&bh->b_wait); + set_bh_page(bh,page,0); + atomic_set(&bh->b_count, 1); + + e->block_id = ID_VACANT; +} + +static inline void drbd_bio_set_pages_dirty(struct buffer_head *bh) +{ + set_bit(BH_Dirty, &bh->b_state); +} + +static inline void drbd_bio_set_end_io(struct buffer_head *bh, bh_end_io_t * h) +{ + bh->b_end_io = h; +} + +static inline void +drbd_ee_bh_prepare(drbd_dev *mdev, struct buffer_head *bh, + sector_t sector, int size) +{ + D_ASSERT(mdev->backing_bdev); + + bh->b_blocknr = sector; // We abuse b_blocknr here. + bh->b_size = size; + bh->b_rsector = sector; + bh->b_rdev = mdev->backing_bdev; + bh->b_private = mdev; + bh->b_state = (1 << BH_Req) + |(1 << BH_Mapped) + |(1 << BH_Lock); +} + +static inline void +drbd_ee_prepare_write(drbd_dev *mdev, struct Tl_epoch_entry* e, + sector_t sector, int size) +{ + struct buffer_head * const bh = &e->private_bio; + + drbd_ee_bh_prepare(mdev,bh,sector,size); + set_bit(BH_Uptodate,&bh->b_state); + set_bit(BH_Dirty,&bh->b_state); + bh->b_end_io = drbd_dio_end_sec; +} + +static inline void +drbd_ee_prepare_read(drbd_dev *mdev, struct Tl_epoch_entry* e, + sector_t sector, int size) +{ + struct buffer_head * const bh = &e->private_bio; + + drbd_ee_bh_prepare(mdev,bh,sector,size); + bh->b_end_io = enslaved_read_bi_end_io; +} + +static inline void +drbd_bh_clone(struct buffer_head *bh, struct buffer_head *bh_src) +{ + memset(bh,0,sizeof(*bh)); + bh->b_list = bh_src->b_list; // BUF_LOCKED; + bh->b_size = bh_src->b_size; + bh->b_state = bh_src->b_state & ((1 << BH_PrivateStart)-1); + bh->b_page = bh_src->b_page; + bh->b_data = bh_src->b_data; + bh->b_rsector = bh_src->b_rsector; + bh->b_blocknr = bh_src->b_rsector; // We abuse b_blocknr here. 
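+ // (keeping the sector in b_blocknr lets drbd_req_get_sector() above
+ // return it directly, without knowing about b_size units)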
+ bh->b_dev = bh_src->b_dev; // hint for LVM as to + // which device to call fsync_dev + // on for snapshots + atomic_set(&bh->b_count, 1); + init_waitqueue_head(&bh->b_wait); + // other members stay NULL +} + +static inline void +drbd_req_prepare_write(drbd_dev *mdev, struct drbd_request *req) +{ + struct buffer_head * const bh = &req->private_bio; + struct buffer_head * const bh_src = req->master_bio; + + drbd_bh_clone(bh,bh_src); + bh->b_rdev = mdev->backing_bdev; + bh->b_private = mdev; + bh->b_end_io = drbd_dio_end; + + D_ASSERT(buffer_req(bh)); + D_ASSERT(buffer_locked(bh)); + D_ASSERT(buffer_mapped(bh)); + // D_ASSERT(buffer_dirty(bh)); // It is not true ?!? + /* kupdated keeps submitting "non-uptodate" buffers. + ERR_IF (!buffer_uptodate(bh)) { + ERR("[%s/%d]: bh_src->b_state=%lx bh->b_state=%lx\n", + current->comm, current->pid, + bh_src->b_state, bh->b_state); + }; + */ + + // FIXME should not be necessary; + // remove if the assertions above do not trigger. + bh->b_state = (1 << BH_Uptodate) + |(1 << BH_Dirty) + |(1 << BH_Lock) + |(1 << BH_Req) + |(1 << BH_Mapped) ; + + req->rq_status = RQ_DRBD_NOTHING; +} + +static inline void +drbd_req_prepare_read(drbd_dev *mdev, struct drbd_request *req) +{ + struct buffer_head * const bh = &req->private_bio; + struct buffer_head * const bh_src = req->master_bio; + + drbd_bh_clone(bh,bh_src); + bh->b_rdev = mdev->backing_bdev; + bh->b_private = mdev; + bh->b_end_io = drbd_read_bi_end_io; + + D_ASSERT(buffer_req(bh)); + D_ASSERT(buffer_locked(bh)); + D_ASSERT(buffer_mapped(bh)); + D_ASSERT(!buffer_uptodate(bh)); + + // FIXME should not be necessary; + // remove if the assertions above do not trigger. + bh->b_state = (1 << BH_Lock) + |(1 << BH_Req) + |(1 << BH_Mapped) ; + + req->rq_status = RQ_DRBD_NOTHING; +} + +static inline struct page* drbd_bio_get_page(struct buffer_head *bh) +{ + return bh->b_page; +} + +static inline void drbd_generic_make_request(int rw, struct buffer_head *bh) +{ + drbd_dev *mdev = drbd_conf -1 ; + + if (!bh->b_rdev) { + if (DRBD_ratelimit(5*HZ,5)) { + printk(KERN_ERR "drbd_generic_make_request: bh->b_rdev == NULL\n"); + dump_stack(); + } + drbd_bio_IO_error(bh); + return; + } + + generic_make_request(rw, bh); +} + +static inline void drbd_kick_lo(drbd_dev *mdev) +{ + run_task_queue(&tq_disk); +} + +static inline void drbd_plug_device(drbd_dev *mdev) +{ + D_ASSERT(mdev->state == Primary); + if (mdev->cstate < Connected) + return; + if (!test_and_set_bit(UNPLUG_QUEUED,&mdev->flags)) { + /* if it could not be queued, clear our flag again, too */ + if (!queue_task(&mdev->write_hint_tq, &tq_disk)) + clear_bit(UNPLUG_QUEUED,&mdev->flags); + } +} + +/* for increased performance, + * we try to use zero copy network send whenever possible. + * + * maybe TODO: + * find out whether we can use zero copy network recv, too, somehow. + * we'd need to define some sk_read_actor_t, and then use + * tcp_read_sock ... + */ +static inline int _drbd_send_zc_bio(drbd_dev *mdev, struct buffer_head *bh) +{ + struct page *page = bh->b_page; + size_t size = bh->b_size; + + return _drbd_send_page(mdev,page,bh_offset(bh),size); +} + +/* for proto A, we cannot use zero copy network send: + * we don't want to "ack" a send when we put a reference to it on the socket, + * but when it actually has reached the sendbuffer (so is likely to actually be + * on the wire in a couple of jiffies). 
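+ * That is why _drbd_send_bio() below falls back to bh_kmap() plus
+ * drbd_send(), which returns only after the data has been copied into
+ * the socket's send buffer.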
+ */ +static inline int _drbd_send_bio(drbd_dev *mdev, struct buffer_head *bh) +{ + size_t size = bh->b_size; + int ret; + + ret = drbd_send(mdev, mdev->data.socket, bh_kmap(bh), size, 0); + bh_kunmap(bh); + return ret; +} + +#else +// LINUX_VERSION_CODE > 2,5,0 + +#include // for fsync_bdev + +/* see get_sb_bdev and bd_claim */ +extern char* drbd_sec_holder; + +// bi_end_io handlers +// int (bio_end_io_t) (struct bio *, unsigned int, int); +extern int drbd_md_io_complete (struct bio *bio, unsigned int bytes_done, int error); +extern int enslaved_read_bi_end_io (struct bio *bio, unsigned int bytes_done, int error); +extern int drbd_dio_end_sec (struct bio *bio, unsigned int bytes_done, int error); +extern int drbd_dio_end (struct bio *bio, unsigned int bytes_done, int error); +extern int drbd_read_bi_end_io (struct bio *bio, unsigned int bytes_done, int error); + +static inline sector_t drbd_get_hardsect(struct block_device *bdev) +{ + return bdev->bd_disk->queue->hardsect_size; +} + +/* Returns the number of 512 byte sectors of the device */ +static inline sector_t drbd_get_capacity(struct block_device *bdev) +{ + return bdev ? bdev->bd_inode->i_size >> 9 : 0; +} + +/* sets the number of 512 byte sectors of our virtual device */ +static inline void drbd_set_my_capacity(drbd_dev *mdev, sector_t size) +{ + set_capacity(mdev->vdisk,size); + mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9; +} + +//#warning "FIXME why don't we care for the return value?" +static inline void drbd_set_blocksize(drbd_dev *mdev, int blksize) +{ + set_blocksize(mdev->this_bdev,blksize); + if (mdev->backing_bdev) { + set_blocksize(mdev->backing_bdev, blksize); + } else { + D_ASSERT(mdev->backing_bdev); + // FIXME send some package over to the peer? + } +} + +static inline int drbd_sync_me(drbd_dev *mdev) +{ + return fsync_bdev(mdev->this_bdev); +} + +#define drbd_bio_uptodate(bio) bio_flagged(bio,BIO_UPTODATE) + +static inline void drbd_bio_IO_error(struct bio *bio) +{ + bio_endio(bio,bio->bi_size,-EIO); +} + +static inline void drbd_bio_endio(struct bio *bio, int uptodate) +{ + bio_endio(bio,bio->bi_size,uptodate ? 0 : -EIO); +} + +static inline drbd_dev* drbd_req_get_mdev(struct drbd_request *req) +{ + return (drbd_dev*) req->mdev; +} + +static inline sector_t drbd_req_get_sector(struct drbd_request *req) +{ + return req->master_bio->bi_sector; +} + +static inline unsigned short drbd_req_get_size(struct drbd_request *req) +{ + drbd_dev* mdev = req->mdev; + D_ASSERT(req->master_bio->bi_size); + return req->master_bio->bi_size; +} + +static inline drbd_bio_t* drbd_req_private_bio(struct drbd_request *req) +{ + return req->private_bio; +} + +static inline sector_t drbd_ee_get_sector(struct Tl_epoch_entry *ee) +{ + return ee->ee_sector; +} + +static inline unsigned short drbd_ee_get_size(struct Tl_epoch_entry *ee) +{ + return ee->ee_size; +} + +#ifdef CONFIG_HIGHMEM +/* + * I don't know why there is no bvec_kmap, only bvec_kmap_irq ... + * + * we do a sock_recvmsg into the target buffer, + * so we obviously cannot use the bvec_kmap_irq variant. -lge + * + * Most likely it is only due to performance anyways: + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because + * no global lock is needed and because the kmap code must perform a global TLB + * invalidation when the kmap pool wraps. + * + * However when holding an atomic kmap is is not legal to sleep, so atomic + * kmaps are appropriate for short, tight code paths only. 
+ */ +static inline char *drbd_bio_kmap(struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + unsigned long addr; + + addr = (unsigned long) kmap(bvec->bv_page); + + if (addr & ~PAGE_MASK) + BUG(); + + return (char *) addr + bvec->bv_offset; +} + +static inline void drbd_bio_kunmap(struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + + kunmap(bvec->bv_page); +} + +#else +static inline char *drbd_bio_kmap(struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + return page_address(bvec->bv_page) + bvec->bv_offset; +} +static inline void drbd_bio_kunmap(struct bio *bio) +{ + // do nothing. +} +#endif + +static inline void drbd_ee_init(struct Tl_epoch_entry *e,struct page *page) +{ + struct bio * const bio = &e->private_bio; + struct bio_vec * const vec = &e->ee_bvec; + + memset(e, 0, sizeof(*e)); + bio_init(bio); + + bio->bi_io_vec = vec; + bio->bi_destructor = NULL; + vec->bv_page = page; + bio->bi_size = vec->bv_len = PAGE_SIZE; + bio->bi_max_vecs = bio->bi_vcnt = 1; + vec->bv_offset = 0; + + e->block_id = ID_VACANT; +} + +static inline void drbd_bio_set_pages_dirty(struct bio *bio) +{ + bio_set_pages_dirty(bio); +} + +static inline void drbd_bio_set_end_io(struct bio *bio, bio_end_io_t * h) +{ + bio->bi_end_io = h; +} + +static inline void +drbd_ee_bio_prepare(drbd_dev *mdev, struct Tl_epoch_entry* e, + sector_t sector, int size) +{ + struct bio * const bio = &e->private_bio; + struct bio_vec * const vec = &e->ee_bvec; + struct page * const page = vec->bv_page; + D_ASSERT(mdev->backing_bdev); + + /* Clear plate. */ + bio_init(bio); + + bio->bi_io_vec = vec; + bio->bi_destructor = NULL; + vec->bv_page = page; + vec->bv_offset = 0; + bio->bi_max_vecs = bio->bi_vcnt = 1; + + bio->bi_bdev = mdev->backing_bdev; + bio->bi_private = mdev; + + e->ee_sector = bio->bi_sector = sector; + e->ee_size = bio->bi_size = bio->bi_io_vec->bv_len = size; +} + +static inline void +drbd_ee_prepare_write(drbd_dev *mdev, struct Tl_epoch_entry* e, + sector_t sector, int size) +{ + drbd_ee_bio_prepare(mdev,e,sector,size); + e->private_bio.bi_end_io = drbd_dio_end_sec; +} + +static inline void +drbd_ee_prepare_read(drbd_dev *mdev, struct Tl_epoch_entry* e, + sector_t sector, int size) +{ + drbd_ee_bio_prepare(mdev,e,sector,size); + e->private_bio.bi_end_io = enslaved_read_bi_end_io; +} + +static inline void +drbd_req_prepare_write(drbd_dev *mdev, struct drbd_request *req) +{ + struct bio *bio; + + bio = req->private_bio = bio_clone(req->master_bio, GFP_NOIO ); + bio->bi_bdev = mdev->backing_bdev; + bio->bi_private = req; + bio->bi_end_io = drbd_dio_end; + bio->bi_next = 0; + + req->rq_status = RQ_DRBD_NOTHING; + req->mdev = mdev; +} + +static inline void +drbd_req_prepare_read(drbd_dev *mdev, struct drbd_request *req) +{ + struct bio *bio; + + bio = req->private_bio = bio_clone(req->master_bio, GFP_NOIO ); + bio->bi_bdev = mdev->backing_bdev; + bio->bi_private = req; + bio->bi_end_io = drbd_read_bi_end_io; // <- only difference + bio->bi_next = 0; + + req->rq_status = RQ_DRBD_NOTHING; + req->mdev = mdev; +} + +static inline struct page* drbd_bio_get_page(struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + return bvec->bv_page; +} + +/* + * used to submit our private bio + */ +static inline void drbd_generic_make_request(int rw, struct bio *bio) +{ + drbd_dev *mdev = drbd_conf -1; // for DRBD_ratelimit + bio->bi_rw = rw; // on the receiver side, e->..rw was not yet defined. 
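+ /* a bio without a target device means the lower level device went away
+  * underneath us (e.g. after a detach); fail it here instead of letting
+  * generic_make_request() dereference a NULL bi_bdev */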
+ + if (!bio->bi_bdev) { + if (DRBD_ratelimit(5*HZ,5)) { + printk(KERN_ERR "drbd_generic_make_request: bio->bi_bdev == NULL\n"); + dump_stack(); + } + drbd_bio_IO_error(bio); + return; + } + + generic_make_request(bio); +} + +static inline void drbd_blk_run_queue(request_queue_t *q) +{ + if (q && q->unplug_fn) + q->unplug_fn(q); +} + +static inline void drbd_kick_lo(drbd_dev *mdev) +{ + if (!mdev->backing_bdev) { + if (DRBD_ratelimit(5*HZ,5)) { + ERR("backing_bdev==NULL in drbd_kick_lo\n"); + dump_stack(); + } + } else { + drbd_blk_run_queue(bdev_get_queue(mdev->backing_bdev)); + } +} + +static inline void drbd_plug_device(drbd_dev *mdev) +{ + request_queue_t *q = bdev_get_queue(mdev->this_bdev); + + spin_lock_irq(q->queue_lock); + +/* XXX the check on !blk_queue_plugged is redundant, + * implicitly checked in blk_plug_device */ + + if(!blk_queue_plugged(q)) { + blk_plug_device(q); + del_timer(&q->unplug_timer); + // unplugging should not happen automatically... + } + spin_unlock_irq(q->queue_lock); +} + +static inline int _drbd_send_zc_bio(drbd_dev *mdev, struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec_idx(bio, bio->bi_idx); + return _drbd_send_page(mdev,bvec->bv_page,bvec->bv_offset,bvec->bv_len); +} + +static inline int _drbd_send_bio(drbd_dev *mdev, struct bio *bio) +{ + struct bio_vec *bvec = bio_iovec(bio); + struct page *page = bvec->bv_page; + size_t size = bvec->bv_len; + int offset = bvec->bv_offset; + int ret; + + ret = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0); + kunmap(page); + return ret; +} + +#endif --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./drivers/block/drbd/drbd_fs.c 2006-06-21 16:57:01.000000000 +0400 @@ -0,0 +1,1460 @@ +/* +-*- linux-c -*- + drbd_fs.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 1999-2004, Philipp Reisner . + main author. + + Copyright (C) 2002-2004, Lars Ellenberg . + main contributor. + + Copyright (C) 2000, Fábio Olivé Leite . + Some sanity checks in IOCTL_SET_STATE. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "drbd_int.h" + +#include + +ONLY_IN_26( +/* see get_sb_bdev and bd_claim */ +char *drbd_sec_holder = "Secondary DRBD cannot be bd_claimed ;)"; +char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; +) + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +STATIC enum { NotMounted=0,MountedRO,MountedRW } drbd_is_mounted(int minor) +{ + struct super_block *sb; + + sb = get_super(MKDEV(MAJOR_NR, minor)); + if(!sb) return NotMounted; + + if(sb->s_flags & MS_RDONLY) { + drop_super(sb); + return MountedRO; + } + + drop_super(sb); + return MountedRW; +} +#endif + +char* ppsize(char* buf, size_t size) +{ + // Needs 9 bytes at max. 
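+ // size is given in KB (as elsewhere in the sizing code); it is scaled
+ // down by 1024 until it fits in at most four digits, so e.g.
+ // 16777216 KB prints as "16 GB".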
+ static char units[] = { 'K','M','G','T' }; + int base = 0; + while (size >= 10000 ) { + size = size >> 10; + base++; + } + sprintf(buf,"%ld %cB",(long)size,units[base]); + + return buf; +} + +/* Returns -ENOMEM if we could not allocate the bitmap + * + * currently *_size is in KB. + * + * FIXME + * since this is done by drbd receiver as well as from drbdsetup, + * this actually needs proper locking! + * drbd_bm_resize already protects itself with a mutex. + * but again, this is a state change, and thus should be serialized with other + * state changes on a more general level already. + */ +int drbd_determin_dev_size(struct Drbd_Conf* mdev) +{ + sector_t pmdss; // previous meta data start sector + sector_t la_size; + sector_t size; + char ppb[10]; + + int md_moved, la_size_changed; + int rv=0; + + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + pmdss = drbd_md_ss(mdev); + la_size = mdev->la_size; + + size = drbd_new_dev_size(mdev); + + if( (drbd_get_capacity(mdev->this_bdev)>>1) != size ) { + int err; + err = drbd_bm_resize(mdev,size<<1); // wants sectors + if (unlikely(err)) { + /* currently there is only one error: ENOMEM! */ + size = drbd_bm_capacity(mdev)>>1; + if (size == 0) { + ERR("OUT OF MEMORY! Could not allocate bitmap! Set device size => 0\n"); + } else { + /* FIXME this is problematic, + * if we in fact are smaller now! */ + ERR("BM resizing failed. " + "Leaving size unchanged at size = %lu KB\n", + (unsigned long)size); + } + rv = err; + } + // racy, see comments above. + drbd_set_my_capacity(mdev,size<<1); + mdev->la_size = size; + INFO("size = %s (%lu KB)\n",ppsize(ppb,size), + (unsigned long)size); + } + if (rv < 0) goto out; + + la_size_changed = (la_size != mdev->la_size); + md_moved = pmdss != drbd_md_ss(mdev) /* && mdev->md_index == -1 */; + + if ( md_moved ) { + WARN("Moving meta-data.\n"); + D_ASSERT(mdev->md_index == -1); + } + + if ( la_size_changed || md_moved ) { + if( inc_local_md_only(mdev)) { + drbd_al_shrink(mdev); // All extents inactive. + drbd_bm_write(mdev); // write bitmap + // Write mdev->la_size to on disk. + drbd_md_write(mdev); + dec_local(mdev); + } + } + out: + lc_unlock(mdev->act_log); + + return rv; +} + +/* + * currently *_size is in KB. + */ +sector_t drbd_new_dev_size(struct Drbd_Conf* mdev) +{ + sector_t p_size = mdev->p_size; // partner's disk size. + sector_t la_size = mdev->la_size; // last agreed size. + sector_t m_size; // my size + sector_t u_size = mdev->lo_usize; // size requested by user. + sector_t size=0; + + m_size = drbd_get_capacity(mdev->backing_bdev)>>1; + + if (mdev->md_index == -1 && m_size) {// internal metadata + D_ASSERT(m_size > MD_RESERVED_SIZE); + m_size = drbd_md_ss(mdev)>>1; + } + + if(p_size && m_size) { + size=min_t(sector_t,p_size,m_size); + } else { + if(la_size) { + size=la_size; + if(m_size && m_size < size) size=m_size; + if(p_size && p_size < size) size=p_size; + } else { + if(m_size) size=m_size; + if(p_size) size=p_size; + } + } + + if(size == 0) { + ERR("Both nodes diskless!\n"); + } + + if(u_size) { + if(u_size > size) { + ERR("Requested disk size is too big (%lu > %lu)\n", + (unsigned long)u_size, (unsigned long)size); + } else { + size = u_size; + } + } + + return size; +} + +/* checks that the al lru is of requested size, and if neccessary tries to + * allocate a new one. returns -EBUSY if current al lru is still used, + * -ENOMEM when allocation failed, and 0 on success. 
+ */ +STATIC int drbd_check_al_size(drbd_dev *mdev) +{ + struct lru_cache *n,*t; + struct lc_element *e; + unsigned int in_use; + int i; + + ERR_IF(mdev->sync_conf.al_extents < 7) + mdev->sync_conf.al_extents = 127; + + if ( mdev->act_log && + mdev->act_log->nr_elements == mdev->sync_conf.al_extents ) + return 0; + + in_use = 0; + t = mdev->act_log; + n = lc_alloc(mdev->sync_conf.al_extents, + sizeof(struct lc_element), mdev); + + if (n==NULL) { + ERR("Cannot allocate act_log lru!\n"); + return -ENOMEM; + } + spin_lock_irq(&mdev->al_lock); + if (t) { + for (i=0; i < t->nr_elements; i++) { + e = lc_entry(t,i); + if (e->refcnt) + ERR("refcnt(%d)==%d\n", + e->lc_number, e->refcnt); + in_use += e->refcnt; + } + } + if (!in_use) { + mdev->act_log = n; + } + spin_unlock_irq(&mdev->al_lock); + if (in_use) { + ERR("Activity log still in use!\n"); + lc_free(n); + return -EBUSY; + } else { + if (t) lc_free(t); + } + drbd_md_write(mdev); + return 0; +} + +STATIC int drbd_detach_ioctl(drbd_dev *mdev); + +STATIC +int drbd_ioctl_set_disk(struct Drbd_Conf *mdev, + struct ioctl_disk_config * arg) +{ + NOT_IN_26(int err;) // unused in 26 ?? cannot believe it ... + int i, md_gc_valid, minor, mput=0, apply_al; + enum ret_codes retcode; + struct disk_config new_conf; + struct file *filp = 0; + struct file *filp2 = 0; + struct inode *inode, *inode2; + NOT_IN_26(kdev_t bdev, bdev2;) + ONLY_IN_26(struct block_device *bdev, *bdev2;) + + minor=(int)(mdev-drbd_conf); + + /* if you want to reconfigure, please tear down first */ + smp_rmb(); + if (!test_bit(DISKLESS,&mdev->flags)) + return -EBUSY; + + /* if this was "adding" a lo dev to a previously "diskless" node, + * there still could be requests comming in right now. brrks. + * if it was mounted, we had an open_cnt > 1, + * so it would be BUSY anyways... + */ + ERR_IF (mdev->state != Secondary) + return -EBUSY; + + if (mdev->open_cnt > 1) + return -EBUSY; + + if (copy_from_user(&new_conf, &arg->config,sizeof(struct disk_config))) + return -EFAULT; + + /* FIXME + * I'd like to do it here, so I can just fail this ioctl with ENOMEM. + * but drbd_md_read below might change the al_nr_extens again, so need + * to do it there again anyways... + * but then I already changed it all and cannot easily undo it.. + * for now, do it there, but then if it fails, rather panic than later + * have a NULL pointer dereference. + * + i = drbd_check_al_size(mdev); + if (i) return i; + * + */ + + if (mdev->cstate == Unconfigured) { + // ioctl already has a refcnt + __module_get(THIS_MODULE); + mput = 1; + } else { + /* We currently cannot handle reattach while connected */ + return -EBUSY; + + /* FIXME allow reattach while connected, + * and allow it in Primary/Diskless state... + * currently there are strange races leading to a distributed + * deadlock in that case... 
+ */ + if ( mdev->cstate != StandAlone /* && + mdev->cstate != Connected */) { + return -EBUSY; + } + } + + if ( new_conf.meta_index < -1) { + retcode=LDMDInvalid; + goto fail_ioctl; + } + + filp = fget(new_conf.lower_device); + if (!filp) { + retcode=LDFDInvalid; + goto fail_ioctl; + } + + inode = filp->f_dentry->d_inode; + + if (!S_ISBLK(inode->i_mode)) { + retcode=LDNoBlockDev; + goto fail_ioctl; + } + + filp2 = fget(new_conf.meta_device); + + if (!filp2) { + retcode=MDFDInvalid; + goto fail_ioctl; + } + + inode2 = filp2->f_dentry->d_inode; + + if (!S_ISBLK(inode2->i_mode)) { + retcode=MDNoBlockDev; + goto fail_ioctl; + } + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0) + bdev = inode->i_bdev; + if (bd_claim(bdev, mdev)) { + retcode=LDMounted; + goto fail_ioctl; + } + + bdev2 = inode2->i_bdev; + if (bd_claim(bdev2, new_conf.meta_index== - 1 ? + (void *)mdev : (void*) drbd_m_holder )) { + retcode=MDMounted; + goto release_bdev_fail_ioctl; + } +#else + for(i=0;ii_rdev == drbd_conf[i].backing_bdev) { + retcode=LDAlreadyInUse; + goto fail_ioctl; + } + } + + if (drbd_is_mounted(inode->i_rdev)) { + WARN("can not configure %d:%d, has active inodes!\n", + MAJOR(inode->i_rdev), MINOR(inode->i_rdev)); + retcode=LDMounted; + goto fail_ioctl; + } + + if ((err = blkdev_open(inode, filp))) { + ERR("blkdev_open( %d:%d ,) returned %d\n", + MAJOR(inode->i_rdev), MINOR(inode->i_rdev), err); + retcode=LDOpenFailed; + goto fail_ioctl; + } + bdev = inode->i_rdev; + + if ((err = blkdev_open(inode2, filp2))) { + ERR("blkdev_open( %d:%d ,) returned %d\n", + MAJOR(inode->i_rdev), MINOR(inode->i_rdev), err); + retcode=MDOpenFailed; + goto release_bdev_fail_ioctl; + } + bdev2 = inode2->i_rdev; +#endif + + if ( (bdev == bdev2) != (new_conf.meta_index == -1) ) { + retcode=LDMDInvalid; + goto release_bdev2_fail_ioctl; + } + + if ((drbd_get_capacity(bdev)>>1) < new_conf.disk_size) { + retcode = LDDeviceTooSmall; + goto release_bdev2_fail_ioctl; + } + + if (drbd_get_capacity(bdev) > DRBD_MAX_SECTORS) { + retcode = LDDeviceTooLarge; + goto release_bdev2_fail_ioctl; + } + + if ( new_conf.meta_index == -1 ) i = 1; + else i = new_conf.meta_index+1; + + /* for internal, we need to check agains <= (then we have a drbd with + * zero size, but meta data...) to be on the safe side, I require 32MB + * minimal data storage area for drbd with internal meta data (thats + * 160 total). if someone wants to use that small devices, she can use + * drbd 0.6 anyways... + * + * FIXME this is arbitrary and needs to be reconsidered as soon as we + * move to flexible size meta data. + */ + if( drbd_get_capacity(bdev2) < 2*MD_RESERVED_SIZE*i + + (new_conf.meta_index == -1) ? (1<<16) : 0 ) + { + retcode = MDDeviceTooSmall; + goto release_bdev2_fail_ioctl; + } + + drbd_free_ll_dev(mdev); + + mdev->md_bdev = bdev2; + mdev->md_file = filp2; + mdev->md_index = new_conf.meta_index; + + mdev->backing_bdev = bdev; + mdev->lo_file = filp; + mdev->lo_usize = new_conf.disk_size; + mdev->on_io_error = new_conf.on_io_error; + + mdev->send_cnt = 0; + mdev->recv_cnt = 0; + mdev->read_cnt = 0; + mdev->writ_cnt = 0; + +// FIXME unclutter the code again ;) +/* + * Returns the minimum that is _not_ zero, unless both are zero. + */ +#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? 
l : min(l, r)) +ONLY_IN_26({ + request_queue_t * const q = mdev->rq_queue; + request_queue_t * const b = bdev->bd_disk->queue; + + q->max_sectors = min_not_zero((unsigned short)(PAGE_SIZE >> 9), b->max_sectors); + q->max_phys_segments = 1; + q->max_hw_segments = 1; + q->max_segment_size = min((unsigned)PAGE_SIZE,b->max_segment_size); + q->hardsect_size = max((unsigned short)512,b->hardsect_size); + q->seg_boundary_mask = PAGE_SIZE-1; + D_ASSERT(q->hardsect_size <= PAGE_SIZE); // or we are really screwed ;-) + + if( q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { + INFO("Adjusting my ra_pages to backing device's (%lu -> %lu)\n", + q->backing_dev_info.ra_pages, + b->backing_dev_info.ra_pages); + q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; + } + +}) +#undef min_not_zero + + clear_bit(SENT_DISK_FAILURE,&mdev->flags); + set_bit(MD_IO_ALLOWED,&mdev->flags); + +/* FIXME I think inc_local_md_only within drbd_md_read is misplaced. + * should go here, and the corresponding dec_local, too. + */ + + md_gc_valid = drbd_md_read(mdev); + +/* FIXME if (md_gc_valid < 0) META DATA IO NOT POSSIBLE! */ + + /* If I am currently not Primary, + * but meta data primary indicator is set, + * I just now recover from a hard crash, + * and have been Primary before that crash. + * + * Now, if I had no connection before that crash + * (have been degraded Primary), chances are that + * I won't find my peer now either. + * + * In that case, and _only_ in that case, + * we use the degr-wfc-timeout instead of the default, + * so we can automatically recover from a crash of a + * degraded but active "cluster" after a certain timeout. + */ + clear_bit(USE_DEGR_WFC_T,&mdev->flags); + if ( mdev->state != Primary && + drbd_md_test_flag(mdev,MDF_PrimaryInd) && + !drbd_md_test_flag(mdev,MDF_ConnectedInd) ) { + set_bit(USE_DEGR_WFC_T,&mdev->flags); + } + + drbd_bm_lock(mdev); // racy... + + if(drbd_md_test_flag(mdev,MDF_Consistent) && + drbd_new_dev_size(mdev) < mdev->la_size ) { + D_ASSERT(mdev->cstate == Unconfigured); + D_ASSERT(mput == 1); + /* Do not attach a too small disk.*/ + drbd_bm_unlock(mdev); + ERR("Lower device smaller than last agreed size!\n"); + drbd_free_ll_dev(mdev); + set_cstate(mdev,Unconfigured); + retcode = LDDeviceTooSmall; + module_put(THIS_MODULE); + if (put_user(retcode, &arg->ret_code)) return -EFAULT; + return -EINVAL; + } + + apply_al = drbd_md_test_flag(mdev,MDF_PrimaryInd); + /* All tests on MDF_PrimaryInd and MDF_ConnectedInd must happen before + this point, because determin_dev_size() might call drbd_md_write(), + which in turn modifies these flags. Exceptions are where, we want + to test the current state (drbd_md_compare(), drbd_send_param()). */ + + if (drbd_determin_dev_size(mdev) < 0) { + /* could not allocate bitmap. + * try to undo ... */ + D_ASSERT(mdev->cstate == Unconfigured); + D_ASSERT(mput == 1); + + drbd_bm_unlock(mdev); + + /* from drbd_detach_ioctl */ + drbd_free_ll_dev(mdev); + + set_cstate(mdev,Unconfigured); + drbd_mdev_cleanup(mdev); + module_put(THIS_MODULE); + return -ENOMEM; + } + + if (md_gc_valid <= 0) { + INFO("Assuming that all blocks are out of sync (aka FullSync)\n"); + drbd_bm_set_all(mdev); + drbd_bm_write(mdev); + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_write(mdev); + } else { // md_gc_valid > 0 + /* FIXME this still does not propagate io errors! */ + drbd_bm_read(mdev); + } + + i = drbd_check_al_size(mdev); + if (i) { + /* FIXME see the comment above. 
+ * if this fails I need to undo all changes, + * go back into Unconfigured, + * and fail the ioctl with ENOMEM... + */ + // return i; + drbd_panic("Cannot allocate act_log\n"); + } + + if (md_gc_valid > 0) { + i = drbd_al_read_log(mdev); + if (apply_al && i) { + drbd_al_apply_to_bm(mdev); + drbd_al_to_on_disk_bm(mdev); + } + if(!i) { + ERR("IO error on meta device while reading AL\n"); + drbd_free_ll_dev(mdev); + set_cstate(mdev,Unconfigured); + retcode = MDIOError; + module_put(THIS_MODULE); + if (put_user(retcode, &arg->ret_code)) return -EFAULT; + return -EINVAL; + } + } /* else { + FIXME wipe out on disk al! + } */ + + drbd_set_blocksize(mdev,INITIAL_BLOCK_SIZE); + + if(mdev->cstate == Unconfigured ) { + drbd_thread_start(&mdev->worker); + set_cstate(mdev,StandAlone); + } + + + clear_bit(DISKLESS,&mdev->flags); + smp_wmb(); +// FIXME EXPLAIN: + clear_bit(MD_IO_ALLOWED,&mdev->flags); + + /* FIXME currently only StandAlone here... + * Connected is not possible, since + * above we return -EBUSY in that case */ + D_ASSERT(mdev->cstate <= Connected); + if(mdev->cstate == Connected ) { + drbd_send_param(mdev,1); + } + drbd_bm_unlock(mdev); + + return 0; + + release_bdev2_fail_ioctl: + NOT_IN_26(blkdev_put(filp2->f_dentry->d_inode->i_bdev,BDEV_FILE);) + ONLY_IN_26(bd_release(bdev2);) + release_bdev_fail_ioctl: + NOT_IN_26(blkdev_put(filp->f_dentry->d_inode->i_bdev,BDEV_FILE);) + ONLY_IN_26(bd_release(bdev);) + fail_ioctl: + if (mput) module_put(THIS_MODULE); + if (filp) fput(filp); + if (filp2) fput(filp2); + if (put_user(retcode, &arg->ret_code)) return -EFAULT; + return -EINVAL; +} + +STATIC +int drbd_ioctl_get_conf(struct Drbd_Conf *mdev, struct ioctl_get_config* arg) +{ + struct ioctl_get_config cn; + memset(&cn,0,sizeof(cn)); + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + if (mdev->backing_bdev) { + cn.lower_device_major = MAJOR(mdev->backing_bdev->bd_dev); + cn.lower_device_minor = MINOR(mdev->backing_bdev->bd_dev); + bdevname(mdev->backing_bdev,cn.lower_device_name); + } + if (mdev->md_bdev) { + cn.meta_device_major = MAJOR(mdev->md_bdev->bd_dev); + cn.meta_device_minor = MINOR(mdev->md_bdev->bd_dev); + bdevname(mdev->md_bdev,cn.meta_device_name); + } +#else + cn.lower_device_major=MAJOR(mdev->backing_bdev); + cn.lower_device_minor=MINOR(mdev->backing_bdev); + cn.meta_device_major=MAJOR(mdev->md_bdev); + cn.meta_device_minor=MINOR(mdev->md_bdev); + if (mdev->backing_bdev) { + strncpy(cn.lower_device_name, + bdevname(mdev->backing_bdev), BDEVNAME_SIZE); + } + if (mdev->md_bdev) { + strncpy(cn.meta_device_name, + bdevname(mdev->md_bdev), BDEVNAME_SIZE); + } +#endif + cn.cstate=mdev->cstate; + cn.state=mdev->state; + cn.peer_state=mdev->o_state; + cn.disk_size_user=mdev->lo_usize; + cn.meta_index=mdev->md_index; + cn.on_io_error=mdev->on_io_error; + memcpy(&cn.nconf, &mdev->conf, sizeof(struct net_config)); + memcpy(&cn.sconf, &mdev->sync_conf, sizeof(struct syncer_config)); + + if (copy_to_user(arg,&cn,sizeof(struct ioctl_get_config))) + return -EFAULT; + + return 0; +} + + +STATIC +int drbd_ioctl_set_net(struct Drbd_Conf *mdev, struct ioctl_net_config * arg) +{ + int i,minor, mput=0; + enum ret_codes retcode; + struct net_config new_conf; + + minor=(int)(mdev-drbd_conf); + + // FIXME plausibility check + if (copy_from_user(&new_conf, &arg->config,sizeof(struct net_config))) + return -EFAULT; + + if (mdev->cstate == Unconfigured) { + // ioctl already has a refcnt + __module_get(THIS_MODULE); + mput = 1; + } + +#define M_ADDR(A) (((struct sockaddr_in *)&A.my_addr)->sin_addr.s_addr) 
+#define M_PORT(A) (((struct sockaddr_in *)&A.my_addr)->sin_port) +#define O_ADDR(A) (((struct sockaddr_in *)&A.other_addr)->sin_addr.s_addr) +#define O_PORT(A) (((struct sockaddr_in *)&A.other_addr)->sin_port) + for(i=0;ireceiver); + drbd_free_sock(mdev); + + // TODO plausibility check ... + memcpy(&mdev->conf,&new_conf,sizeof(struct net_config)); + +#if 0 +FIXME + /* for the connection loss logic in drbd_recv + * I _need_ the resulting timeo in jiffies to be + * non-zero and different + * + * XXX maybe rather store the value scaled to jiffies? + * Note: MAX_SCHEDULE_TIMEOUT/HZ*HZ != MAX_SCHEDULE_TIMEOUT + * and HZ > 10; which is unlikely to change... + * Thus, if interrupted by a signal, + * sock_{send,recv}msg returns -EINTR, + * if the timeout expires, -EAGAIN. + */ + // unlikely: someone disabled the timeouts ... + // just put some huge values in there. + if (!mdev->conf.ping_int) + mdev->conf.ping_int = MAX_SCHEDULE_TIMEOUT/HZ; + if (!mdev->conf.timeout) + mdev->conf.timeout = MAX_SCHEDULE_TIMEOUT/HZ*10; + if (mdev->conf.ping_int*10 < mdev->conf.timeout) + mdev->conf.timeout = mdev->conf.ping_int*10/6; + if (mdev->conf.ping_int*10 == mdev->conf.timeout) + mdev->conf.ping_int = mdev->conf.ping_int+1; +#endif + + mdev->send_cnt = 0; + mdev->recv_cnt = 0; + + drbd_thread_start(&mdev->worker); + set_cstate(mdev,Unconnected); + drbd_thread_start(&mdev->receiver); + + return 0; + + fail_ioctl: + if (mput) module_put(THIS_MODULE); + if (put_user(retcode, &arg->ret_code)) return -EFAULT; + return -EINVAL; +} + +int drbd_set_state(drbd_dev *mdev,Drbd_State newstate) +{ + int forced = 0; + int dont_have_good_data; + NOT_IN_26(int minor = mdev-drbd_conf;) + + D_ASSERT(semaphore_is_locked(&mdev->device_mutex)); + + if ( (newstate & 0x3) == mdev->state ) return 0; /* nothing to do */ + + // exactly one of sec or pri. not both. + if ( !((newstate ^ (newstate >> 1)) & 1) ) return -EINVAL; + + if(mdev->cstate == Unconfigured) + return -ENXIO; + + if ( (newstate & Primary) && (mdev->o_state == Primary) ) + return -EACCES; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + smp_rmb(); + if ( (newstate & Secondary) && + (test_bit(WRITER_PRESENT,&mdev->flags) || + drbd_is_mounted(minor) == MountedRW)) + return -EBUSY; +#else + ERR_IF (mdev->this_bdev->bd_contains == 0) { + // FIXME this masks a bug somewhere else! + mdev->this_bdev->bd_contains = mdev->this_bdev; + } + + if ( newstate & Secondary ) { + /* If I got here, I am Primary. I claim me for myself. If that + * does not succeed, someone other has claimed me, so I cannot + * become Secondary. */ + if (bd_claim(mdev->this_bdev,drbd_sec_holder)) + return -EBUSY; + if (disable_bd_claim) + bd_release(mdev->this_bdev); + } +#endif + + + /* I dont have access to good data anywhere, if: + * ( I am diskless OR inconsistent ) + * AND + * ( not connected, or partner has no consistent data either ) + */ + dont_have_good_data = + ( test_bit(DISKLESS, &mdev->flags) + || !drbd_md_test_flag(mdev,MDF_Consistent) ) + && + ( mdev->cstate < Connected + || test_bit(PARTNER_DISKLESS, &mdev->flags) + || !test_bit(PARTNER_CONSISTENT, &mdev->flags) ); + + if (newstate & Primary) { + if ( test_bit(DISKLESS,&mdev->flags) + && mdev->cstate < Connected ) { + /* not even brute force can find data without disk. 
+ * FIXME choose a usefull Error, + * and update drbsetup accordingly */ + return -EIO; + } else if (dont_have_good_data) { + /* ok, either we have a disk (which may be inconsistent) + * or we have a connection */ + if (newstate & DontBlameDrbd) { + forced = 1; + /* make sure the Human count is increased if + * we got here only because it was forced. + * maybe we want to force a FullSync? */ + newstate |= Human; + } else { + return -EIO; + } + } else if (mdev->cstate >= Connected) { + /* do NOT increase the Human count if we are connected, + * and there is no reason for it. See + * drbd_lk9.pdf middle of Page 7 + */ + newstate &= ~(Human|DontBlameDrbd); + } + } + + drbd_sync_me(mdev); + + /* Wait until nothing is on the fly :) */ + if ( wait_event_interruptible( mdev->cstate_wait, + atomic_read(&mdev->ap_pending_cnt) == 0 ) ) { +ONLY_IN_26( + if ( newstate & Secondary ) { + D_ASSERT(mdev->this_bdev->bd_holder == drbd_sec_holder); + bd_release(mdev->this_bdev); + } +) + return -EINTR; + } + + /* FIXME RACE here: if our direct user is not using bd_claim (i.e. + * not a filesystem) since cstate might still be >= Connected, new + * ap requests may come in and increase ap_pending_cnt again! + * but that means someone is misusing DRBD... + * */ + + if (forced) { /* this was --do-what-I-say ... */ + int i; + // drbd_dump_md(mdev,0,0); + for (i=HumanCnt; i < GEN_CNT_SIZE ; i++) { + if (mdev->gen_cnt[i] != 1) { + WARN("Forcefully set consistent! " + "If this screws your data, don't blame DRBD!\n"); + break; + } + } + drbd_md_set_flag(mdev,MDF_Consistent); + } + set_bit(MD_DIRTY,&mdev->flags); // we are changing state! + INFO( "%s/%s --> %s/%s\n", + nodestate_to_name(mdev->state), + nodestate_to_name(mdev->o_state), + nodestate_to_name(newstate & 0x03), + nodestate_to_name(mdev->o_state) ); + mdev->state = (Drbd_State) newstate & 0x03; + if(newstate & Primary) { + NOT_IN_26( set_device_ro(MKDEV(MAJOR_NR, minor), FALSE ); ) + +ONLY_IN_26( + set_disk_ro(mdev->vdisk, FALSE ); + D_ASSERT(mdev->this_bdev->bd_holder == drbd_sec_holder); + bd_release(mdev->this_bdev); + mdev->this_bdev->bd_disk = mdev->vdisk; +) + + if(test_bit(ON_PRI_INC_HUMAN,&mdev->flags)) { + newstate |= Human; + clear_bit(ON_PRI_INC_HUMAN,&mdev->flags); + } + + if(test_bit(ON_PRI_INC_TIMEOUTEX,&mdev->flags)) { + newstate |= TimeoutExpired; + clear_bit(ON_PRI_INC_TIMEOUTEX,&mdev->flags); + } + + if(newstate & Human) { + drbd_md_inc(mdev,HumanCnt); + } else if(newstate & TimeoutExpired ) { + drbd_md_inc(mdev,TimeoutCnt); + } else { + drbd_md_inc(mdev, + mdev->cstate >= Connected ? + ConnectedCnt : ArbitraryCnt); + } + } else { + NOT_IN_26( set_device_ro(MKDEV(MAJOR_NR, minor), TRUE ); ) + ONLY_IN_26( set_disk_ro(mdev->vdisk, TRUE ); ) + } + + if(!test_bit(DISKLESS,&mdev->flags) && (newstate & Secondary)) { + drbd_al_to_on_disk_bm(mdev); + } + /* Primary indicator has changed in any case. 
*/ + drbd_md_write(mdev); + + if (mdev->cstate >= WFReportParams) { + /* if this was forced, we should consider sync */ + drbd_send_param(mdev,forced); + } + + return 0; +} + +static int drbd_get_wait_time(long *tp, struct Drbd_Conf *mdev, + struct ioctl_wait *arg) +{ + long time; + struct ioctl_wait p; + + if(copy_from_user(&p,arg,sizeof(p))) { + return -EFAULT; + } + + if ( test_bit(USE_DEGR_WFC_T,&mdev->flags) ) { + time=p.degr_wfc_timeout; + if (time) WARN("using degr_wfc_timeout=%ld seconds\n", time); + } else { + time=p.wfc_timeout; + } + + time=time*HZ; + if(time==0) time=MAX_SCHEDULE_TIMEOUT; + + *tp=time; + + return 0; +} + +STATIC int drbd_ioctl_set_syncer(struct Drbd_Conf *mdev, + struct ioctl_syncer_config* arg) +{ + struct syncer_config sc; + int err; + + if(copy_from_user(&sc,&arg->config,sizeof(sc))) return -EFAULT; + + sc.use_csums = 0; // TODO, NYI + ERR_IF (sc.rate < 1) sc.rate = 1; + ERR_IF (sc.skip & ~1) sc.skip = !!sc.skip; + ERR_IF (sc.al_extents < 7) sc.al_extents = 127; // arbitrary minimum +#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT) + if(sc.al_extents > AL_MAX) { + ERR("sc.al_extents > %d\n",AL_MAX); + sc.al_extents = AL_MAX; + } +#undef AL_MAX + + mdev->sync_conf.rate = sc.rate; + mdev->sync_conf.use_csums = sc.use_csums; + mdev->sync_conf.skip = sc.skip; + mdev->sync_conf.al_extents = sc.al_extents; + + err = drbd_check_al_size(mdev); + if (err) return err; + + if (mdev->cstate > WFConnection) + drbd_send_sync_param(mdev,&sc); + + drbd_alter_sg(mdev, sc.group); + + return 0; +} + +STATIC int drbd_detach_ioctl(drbd_dev *mdev) +{ + int would_discard_last_good_data; + int interrupted; + + // not during resync. no. + if (mdev->cstate > Connected) return -EBUSY; + + /* this was the last good data copy, if: + * (I am Primary, and not connected ), + * OR + * (we are connected, and Peer has no good data himself) + */ + would_discard_last_good_data = + ( mdev->state == Primary && mdev->cstate < Connected ) + || + ( mdev->cstate >= Connected + && ( test_bit(PARTNER_DISKLESS, &mdev->flags) + || !test_bit(PARTNER_CONSISTENT, &mdev->flags) ) ); + + if ( would_discard_last_good_data ) { + return -ENETRESET; + } + if (test_bit(DISKLESS,&mdev->flags) || + test_bit(PARTNER_DISKLESS,&mdev->flags) ) { + return -ENXIO; + } + + drbd_sync_me(mdev); + + set_bit(DISKLESS,&mdev->flags); + smp_wmb(); + + interrupted = wait_event_interruptible(mdev->cstate_wait, + atomic_read(&mdev->local_cnt)==0); + if ( interrupted ) { + clear_bit(DISKLESS,&mdev->flags); + return -EINTR; + } + + drbd_free_ll_dev(mdev); + +/* FIXME race with sync start +*/ + if (mdev->cstate == Connected) drbd_send_param(mdev,0); +/* FIXME +* if you detach while connected, you are *at least* inconsistent now, +* and should clear MDF_Consistent in metadata, and maybe even set the bitmap +* out of sync. +* since if you reattach, this might be a different lo dev, and then it needs +* to receive a sync! +*/ + if (mdev->cstate == StandAlone) { + // maybe < Connected is better? 
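+ // A detached StandAlone device has nothing left configured: fall back
+ // to Unconfigured and give up the module reference taken at configure time.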
+ set_cstate(mdev,Unconfigured); + drbd_mdev_cleanup(mdev); + module_put(THIS_MODULE); + } + return 0; +} + +#ifdef CONFIG_COMPAT +long drbd_compat_ioctl(struct file *f, unsigned cmd, unsigned long arg) +{ + int ret; + // lock_kernel(); Not needed, since we have mdev->device_mutex + ret = drbd_ioctl(f->f_dentry->d_inode, f, cmd, arg); + // unlock_kernel(); + return ret; +} +#endif + +int drbd_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + int minor,err=0; + long time; + struct Drbd_Conf *mdev; + struct ioctl_wait* wp; +ONLY_IN_26( + struct block_device *bdev = inode->i_bdev; + struct gendisk *disk = bdev->bd_disk; +) + + minor = MINOR(inode->i_rdev); + if (minor >= minor_count) return -ENODEV; + mdev = drbd_conf + minor; + + D_ASSERT(MAJOR(inode->i_rdev) == MAJOR_NR); + + /* + * check whether we can permit this ioctl, and whether is makes sense. + * we don't care for the BLK* ioctls, with 2.6 they never end up here. + * + * for non-sysadmins, we only allow GET_CONFIG (and GET_VERSION) + * all other things need CAP_SYS_ADMIN. + * + * on an Unconfigured device, only configure requests make sense. + * still we silently ignore requests to become secondary or to + * unconfigure. other requests are invalid. + * + * I chose to have an additional switch statement for it + * because I think this makes it more obvious. + * + * because we look at mdev->cstate, it should be inside the lock + * (once we serialize cstate changes, it has to be...) + * + */ + if (!capable(CAP_SYS_ADMIN) + && cmd != DRBD_IOCTL_GET_CONFIG + && cmd != DRBD_IOCTL_GET_VERSION) { + err = -EPERM; + goto out_unlocked; + } + + if (mdev->cstate == Unconfigured) { + switch (cmd) { + default: + /* oops, unknown IOCTL ?? */ + err = -EINVAL; + goto out_unlocked; + + case DRBD_IOCTL_GET_CONFIG: + case DRBD_IOCTL_GET_VERSION: + break; /* always allowed */ + + case DRBD_IOCTL_SET_DISK_CONFIG: + case DRBD_IOCTL_SET_NET_CONFIG: + break; /* no restriction here */ + + case DRBD_IOCTL_UNCONFIG_DISK: + case DRBD_IOCTL_UNCONFIG_NET: + /* no op, so "drbdadm down all" does not fail */ + err = 0; + goto out_unlocked; + + /* the rest of them don't make sense if Unconfigured. + * still, set an Unconfigured device Secondary + * is allowed, so "drbdadm down all" does not fail */ + case DRBD_IOCTL_SET_STATE: + case DRBD_IOCTL_INVALIDATE: + case DRBD_IOCTL_INVALIDATE_REM: + case DRBD_IOCTL_SET_DISK_SIZE: + case DRBD_IOCTL_SET_STATE_FLAGS: + case DRBD_IOCTL_SET_SYNC_CONFIG: + case DRBD_IOCTL_WAIT_CONNECT: + case DRBD_IOCTL_WAIT_SYNC: + err = (cmd == DRBD_IOCTL_SET_STATE && arg == Secondary) + ? 0 : -ENXIO; + goto out_unlocked; + } + } + + if (unlikely(drbd_did_panic == DRBD_MAGIC)) + return -EBUSY; + + if( (err=down_interruptible(&mdev->device_mutex)) ) return err; + /* + * please no 'return', use 'err = -ERRNO; goto out;' + * we hold the device_mutex + */ + +ONLY_IN_26( + D_ASSERT(bdev == mdev->this_bdev); + D_ASSERT(disk == mdev->vdisk); +); + + smp_rmb(); + switch (cmd) { +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +/* see how sys_ioctl and blkdev_ioctl handle it in 2.6 . + * If I understand correctly, only "private" ioctl end up here. + */ + case BLKGETSIZE: + err = put_user(drbd_get_capacity(mdev->this_bdev),(long *)arg); + break; + +#ifdef BLKGETSIZE64 + case BLKGETSIZE64: /* see ./drivers/block/loop.c */ + err = put_user((u64)drbd_get_capacity(mdev->this_bdev)<<9, + (u64*)arg); + break; +#endif + + case BLKROSET: // THINK do we want to intercept this one ? 
+ case BLKROGET: + case BLKFLSBUF: + case BLKSSZGET: + case BLKBSZGET: + case BLKBSZSET: // THINK do we want to intercept this one ? + case BLKPG: + err=blk_ioctl(inode->i_rdev, cmd, arg); + break; +#endif + case DRBD_IOCTL_GET_VERSION: + err = put_user(API_VERSION, (int *) arg); + break; + + case DRBD_IOCTL_SET_STATE: + if (arg & ~(Primary|Secondary|Human|TimeoutExpired| + DontBlameDrbd) ) { + err = -EINVAL; + } else { + err = drbd_set_state(mdev,arg); + } + break; + + case DRBD_IOCTL_SET_STATE_FLAGS: + if (arg & ~(Human|TimeoutExpired) ) { + err = -EINVAL; + } else { + clear_bit(ON_PRI_INC_HUMAN,&mdev->flags); + clear_bit(ON_PRI_INC_TIMEOUTEX,&mdev->flags); + if (arg == 0) break; + + // XXX reduce race: don't set it, + // if we have a connection. + // this does not avoid the race completely, though. + if (mdev->cstate > WFConnection) { + WARN("race avoidance: did not set " + "the state flags (%s), cstate=%s\n", + arg == (Human|TimeoutExpired) + ? "Human|TimeoutExpired" + : arg == Human + ? "Human" + : "TimeoutExpired", + cstate_to_name(mdev->cstate)); + break; + } + + if (arg & Human) + set_bit(ON_PRI_INC_HUMAN,&mdev->flags); + if (arg & TimeoutExpired) + set_bit(ON_PRI_INC_TIMEOUTEX,&mdev->flags); + } + break; + + case DRBD_IOCTL_SET_DISK_CONFIG: + err = drbd_ioctl_set_disk(mdev,(struct ioctl_disk_config*)arg); + break; + + case DRBD_IOCTL_SET_DISK_SIZE: + if (mdev->cstate > Connected) { + err = -EBUSY; + break; + } + if ( mdev->state == Secondary && mdev->o_state == Secondary) { + err = -EINPROGRESS; + break; + } + err=0; + mdev->lo_usize = (unsigned long)arg; + drbd_bm_lock(mdev); + drbd_determin_dev_size(mdev); + drbd_md_write(mdev); // Write mdev->la_size to disk. + drbd_bm_unlock(mdev); + if (mdev->cstate == Connected) drbd_send_param(mdev,1); + break; + + case DRBD_IOCTL_SET_NET_CONFIG: + err = drbd_ioctl_set_net(mdev,(struct ioctl_net_config*) arg); + break; + + case DRBD_IOCTL_SET_SYNC_CONFIG: + err = drbd_ioctl_set_syncer(mdev, + (struct ioctl_syncer_config*) arg); + break; + + case DRBD_IOCTL_GET_CONFIG: + err = drbd_ioctl_get_conf(mdev,(struct ioctl_get_config*) arg); + break; + + case DRBD_IOCTL_UNCONFIG_NET: + if ( mdev->cstate == Unconfigured) break; + if ( ( mdev->state == Primary + && test_bit(DISKLESS,&mdev->flags) ) + || ( mdev->o_state == Primary + && !test_bit(PARTNER_CONSISTENT,&mdev->flags) ) ) + { + err=-ENODATA; + break; + } + /* FIXME what if fsync returns error */ + drbd_sync_me(mdev); + set_bit(DO_NOT_INC_CONCNT,&mdev->flags); + set_cstate(mdev,Unconnected); + drbd_thread_stop(&mdev->receiver); + + if (test_bit(DISKLESS,&mdev->flags)) { + set_cstate(mdev,Unconfigured); + drbd_mdev_cleanup(mdev); + module_put(THIS_MODULE); + } else set_cstate(mdev,StandAlone); + + break; + + case DRBD_IOCTL_UNCONFIG_DISK: + if (mdev->cstate == Unconfigured) break; + err = drbd_detach_ioctl(mdev); + break; + + case DRBD_IOCTL_WAIT_CONNECT: + wp=(struct ioctl_wait*)arg; + if( (err=drbd_get_wait_time(&time,mdev,wp)) ) break; + + // We can drop the mutex, we do not touch anything in mdev. 
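+ // (a connect may take arbitrarily long, so do not block other ioctls;
+ // from here on all exits go through out_unlocked, since device_mutex
+ // is no longer held)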
+ up(&mdev->device_mutex); + + time = wait_event_interruptible_timeout( + mdev->cstate_wait, + mdev->cstate < Unconnected + || mdev->cstate >= Connected, + time ); + if (time < 0) { + err = time; + goto out_unlocked; + } + if (time == 0) { + err = -ETIME; + goto out_unlocked; + } + err=0; // no error + + if(put_user(mdev->cstate>=Connected,&wp->ret_code))err=-EFAULT; + goto out_unlocked; + + case DRBD_IOCTL_WAIT_SYNC: + wp=(struct ioctl_wait*)arg; + if( (err=drbd_get_wait_time(&time,mdev,wp)) ) break; + + up(&mdev->device_mutex); + + do { + time = wait_event_interruptible_timeout( + mdev->cstate_wait, + mdev->cstate == Connected + || mdev->cstate < Unconnected, + time ); + + if (time < 0 ) { + err = time; + goto out_unlocked; + } + + if (mdev->cstate > Connected) { + time=MAX_SCHEDULE_TIMEOUT; + } + + if (time == 0) { + err = -ETIME; + goto out_unlocked; + } + } while ( mdev->cstate != Connected + && mdev->cstate >= Unconnected ); + + err=0; // no error + + if(put_user(mdev->cstate==Connected,&wp->ret_code))err=-EFAULT; + goto out_unlocked; + + case DRBD_IOCTL_INVALIDATE: + /* TODO + * differentiate between different error cases, + * or report the current connection state and flags back + * to userspace */ + + /* disallow "invalidation" of local replica + * when currently in primary state (would be a Bad Idea), + * or during a running sync (won't make any sense) */ + if( (mdev->state == Primary || + (mdev->cstate != Connected && + mdev->cstate != StandAlone)) || + test_bit(DISKLESS,&mdev->flags) || + test_bit(PARTNER_DISKLESS,&mdev->flags) ) { + err = -EINPROGRESS; + break; + } + + drbd_md_set_flag(mdev,MDF_FullSync); + drbd_md_clear_flag(mdev,MDF_Consistent); + drbd_md_write(mdev); + + if (mdev->cstate == Connected) { + /* avoid races with set_in_sync + * for successfull mirrored writes + */ + set_cstate(mdev,WFBitMapT); + wait_event(mdev->cstate_wait, + atomic_read(&mdev->ap_bio_cnt)==0); + } + + drbd_bm_lock(mdev); // racy... + + drbd_bm_set_all(mdev); + drbd_bm_write(mdev); + + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_write(mdev); + + if (mdev->cstate >= Connected) { + drbd_send_short_cmd(mdev,BecomeSyncSource); + drbd_start_resync(mdev,SyncTarget); + } + + drbd_bm_unlock(mdev); + + break; + + case DRBD_IOCTL_INVALIDATE_REM: + if( mdev->o_state == Primary || + mdev->cstate != Connected || + test_bit(DISKLESS,&mdev->flags) || + test_bit(PARTNER_DISKLESS,&mdev->flags) ) { + err = -EINPROGRESS; + break; + } + if ( !drbd_md_test_flag(mdev,MDF_Consistent) ) { + // FIXME use a more descriptive error number + err = -EINVAL; + break; + } + + drbd_md_set_flag(mdev,MDF_FullSync); + drbd_md_write(mdev); + + /* avoid races with set_in_sync + * for successfull mirrored writes + */ + set_cstate(mdev,WFBitMapS); + wait_event(mdev->cstate_wait, + atomic_read(&mdev->ap_bio_cnt)==0); + + drbd_bm_lock(mdev); // racy... + + drbd_bm_set_all(mdev); + drbd_bm_write(mdev); + + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_write(mdev); + + drbd_send_short_cmd(mdev,BecomeSyncTarget); + drbd_start_resync(mdev,SyncSource); + + drbd_bm_unlock(mdev); + + break; + + default: + err = -EINVAL; + } + /* out: */ + up(&mdev->device_mutex); + out_unlocked: + return err; +} --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./drivers/block/drbd/drbd_int.h 2006-06-21 16:57:01.000000000 +0400 @@ -0,0 +1,1549 @@ +/* + drbd_int.h + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 1999-2004, Philipp Reisner . + main author. 
+ + Copyright (C) 2002-2004, Lars Ellenberg . + main contributor. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ +#include +#include +#include +#include +#include +#include +#include + +#include "lru_cache.h" + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,8) +# define HAVE_KERNEL_SENDMSG 1 +#else +# define HAVE_KERNEL_SENDMSG 0 +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +#include "mempool.h" +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20) +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} +#endif + +// module parameter, defined in drbd_main.c +extern int minor_count; +extern int disable_bd_claim; +extern int major_nr; +extern int use_nbd_major; + +// use_nbd_major ? "nbd" : "drbd"; +extern char* drbd_devfs_name; + +#include +#ifdef DRBD_MAJOR +# warning "FIXME. DRBD_MAJOR is now officially defined in major.h" +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +/*lge: this hack is to get rid of the compiler warnings about + * 'do_nbd_request declared static but never defined' + * whilst forcing blk.h defines on + * though we probably do not need them, we do not use them... + * would not work without LOCAL_END_REQUEST + */ +# define MAJOR_NR DRBD_MAJOR +# define DEVICE_ON(device) +# define DEVICE_OFF(device) +# define DEVICE_NR(device) (MINOR(device)) +# define LOCAL_END_REQUEST +# include +# define DRBD_MAJOR major_nr +#else +# include +# include +# define MAJOR_NR major_nr +#endif + +#undef DEVICE_NAME +#define DEVICE_NAME "drbd" + +// XXX do we need this? +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +#define INITIAL_BLOCK_SIZE (1<<12) // 4K + +/* I don't remember why XCPU ... + * This is used to wake the asender, + * and to interrupt sending the sending task + * on disconnect. + */ +#define DRBD_SIG SIGXCPU + +/* This is used to stop/restart our threads. + * Cannot use SIGTERM nor SIGKILL, since these + * are sent out by init on runlevel changes + * I choose SIGHUP for now. + * + * FIXME btw, we should register some reboot notifier. 
+ */ +#define DRBD_SIGKILL SIGHUP + +#define ID_SYNCER (-1LL) +#define ID_VACANT 0 // All EEs on the free list should have this value + // freshly allocated EEs get !ID_VACANT (== 1) + // so if it says "cannot dereference null + // pointer at adress 0x00000001, it is most + // probably one of these :( + +struct Drbd_Conf; +typedef struct Drbd_Conf drbd_dev; + +#ifdef DBG_ALL_SYMBOLS +# define STATIC +#else +# define STATIC static +#endif + +#ifdef PARANOIA +# define PARANOIA_BUG_ON(x) BUG_ON(x) +#else +# define PARANOIA_BUG_ON(x) +#endif + +/* + * Some Message Macros + *************************/ + +// handy macro: DUMPP(somepointer) +#define DUMPP(A) ERR( #A " = %p in %s:%d\n", (A),__FILE__,__LINE__); +#define DUMPLU(A) ERR( #A " = %lu in %s:%d\n", (A),__FILE__,__LINE__); +#define DUMPLLU(A) ERR( #A " = %llu in %s:%d\n",(A),__FILE__,__LINE__); +#define DUMPLX(A) ERR( #A " = %lx in %s:%d\n", (A),__FILE__,__LINE__); +#define DUMPI(A) ERR( #A " = %d in %s:%d\n", (A),__FILE__,__LINE__); + +#define DUMPST(A) DUMPLLU((unsigned long long)(A)) + + +// Info: do not remove the spaces around the "," before ## +// Otherwise this is not portable from gcc-2.95 to gcc-3.3 +#define PRINTK(level,fmt,args...) \ + printk(level DEVICE_NAME "%d: " fmt, \ + (int)(mdev-drbd_conf) , ##args) + +#define ALERT(fmt,args...) PRINTK(KERN_ALERT, fmt , ##args) +#define ERR(fmt,args...) PRINTK(KERN_ERR, fmt , ##args) +#define WARN(fmt,args...) PRINTK(KERN_WARNING, fmt , ##args) +#define INFO(fmt,args...) PRINTK(KERN_INFO, fmt , ##args) +#define DBG(fmt,args...) PRINTK(KERN_DEBUG, fmt , ##args) + +/* see kernel/printk.c:printk_ratelimit + * macro, so it is easy do have independend rate limits at different locations + * "initializer element not constant ..." with kernel 2.4 :( + * so I initialize toks to something large + */ +#define DRBD_ratelimit(ratelimit_jiffies,ratelimit_burst) \ +({ \ + int __ret; \ + static unsigned long toks = 0x80000000UL; \ + static unsigned long last_msg; \ + static int missed; \ + unsigned long now = jiffies; \ + toks += now - last_msg; \ + last_msg = now; \ + if (toks > (ratelimit_burst * ratelimit_jiffies)) \ + toks = ratelimit_burst * ratelimit_jiffies; \ + if (toks >= ratelimit_jiffies) { \ + int lost = missed; \ + missed = 0; \ + toks -= ratelimit_jiffies; \ + if (lost) \ + WARN("%d messages suppressed in %s:%d.\n",\ + lost , __FILE__ , __LINE__ ); \ + __ret=1; \ + } else { \ + missed++; \ + __ret=0; \ + } \ + __ret; \ +}) + + +#ifdef DBG_ASSERTS +extern void drbd_assert_breakpoint(drbd_dev*, char *, char *, int ); +# define D_ASSERT(exp) if (!(exp)) \ + drbd_assert_breakpoint(mdev,#exp,__FILE__,__LINE__) +#else +# define D_ASSERT(exp) if (!(exp)) \ + ERR("ASSERT( " #exp " ) in %s:%d\n", __FILE__,__LINE__) +#endif +#define ERR_IF(exp) if (({ \ + int _b = (exp)!=0; \ + if (_b) ERR("%s: (" #exp ") in %s:%d\n", __func__, __FILE__,__LINE__); \ + _b; \ + })) + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,9) +#include +#else +// RH 2.4.9 does not have linux/stringify.h +#define __stringify_1(x) #x +#define __stringify(x) __stringify_1(x) +#endif + +// integer division, round _UP_ to the next integer +#define div_ceil(A,B) ( (A)/(B) + ((A)%(B) ? 
1 : 0) )
+// usual integer division
+#define div_floor(A,B) ( (A)/(B) )
+
+/*
+ * Compatibility Section
+ *************************/
+
+#include "drbd_compat_types.h"
+
+#ifdef SIGHAND_HACK
+# define LOCK_SIGMASK(task,flags) spin_lock_irqsave(&task->sighand->siglock, flags)
+# define UNLOCK_SIGMASK(task,flags) spin_unlock_irqrestore(&task->sighand->siglock, flags)
+# define RECALC_SIGPENDING() recalc_sigpending();
+#else
+# define LOCK_SIGMASK(task,flags) spin_lock_irqsave(&task->sigmask_lock, flags)
+# define UNLOCK_SIGMASK(task,flags) spin_unlock_irqrestore(&task->sigmask_lock, flags)
+# define RECALC_SIGPENDING() recalc_sigpending(current);
+#endif
+
+#if defined(DBG_SPINLOCKS) && defined(__SMP__)
+# define MUST_HOLD(lock) if(!spin_is_locked(lock)) { ERR("Not holding lock! in %s\n", __FUNCTION__ ); }
+#else
+# define MUST_HOLD(lock)
+#endif
+
+/*
+ * our structs
+ *************************/
+
+#ifndef typecheck
+/*
+ * Check at compile time that something is of a particular type.
+ * Always evaluates to 1 so you may use it easily in comparisons.
+ */
+#define typecheck(type,x) \
+({ type __dummy; \
+ typeof(x) __dummy2; \
+ (void)(&__dummy == &__dummy2); \
+ 1; \
+})
+#endif
+
+#define SET_MAGIC(x) ((x)->magic = (long)(x) ^ DRBD_MAGIC)
+#define VALID_POINTER(x) ((x) ? (((x)->magic ^ DRBD_MAGIC) == (long)(x)):0)
+#define INVALIDATE_MAGIC(x) (x->magic--)
+
+#define SET_MDEV_MAGIC(x) \
+ ({ typecheck(struct Drbd_Conf*,x); \
+ (x)->magic = (long)(x) ^ DRBD_MAGIC; })
+#define IS_VALID_MDEV(x) \
+ ( typecheck(struct Drbd_Conf*,x) && \
+ ((x) ? (((x)->magic ^ DRBD_MAGIC) == (long)(x)):0))
+
+/* these defines should go into blkdev.h
+ (if it will ever be included into linus' linux) */
+#define RQ_DRBD_NOTHING 0x0001
+#define RQ_DRBD_SENT 0x0010
+#define RQ_DRBD_LOCAL 0x0020
+#define RQ_DRBD_DONE 0x0030
+#define RQ_DRBD_IN_TL 0x0040
+
+enum MetaDataFlags {
+ __MDF_Consistent,
+ __MDF_PrimaryInd,
+ __MDF_ConnectedInd,
+ __MDF_FullSync,
+};
+#define MDF_Consistent (1<<__MDF_Consistent)
+#define MDF_PrimaryInd (1<<__MDF_PrimaryInd)
+#define MDF_ConnectedInd (1<<__MDF_ConnectedInd)
+#define MDF_FullSync (1<<__MDF_FullSync)
+
+/* drbd_meta-data.c (still in drbd_main.c) */
+enum MetaDataIndex {
+ Flags, /* Consistency flag, connected-ind, primary-ind */
+ HumanCnt, /* human-intervention-count */
+ TimeoutCnt, /* timeout-count */
+ ConnectedCnt, /* connected-count */
+ ArbitraryCnt, /* arbitrary-count */
+ GEN_CNT_SIZE // MUST BE LAST! (and Flags must stay first...)
+};
+
+#define DRBD_MD_MAGIC (DRBD_MAGIC+3) // 3rd incarnation of the file format.
+
+#define DRBD_PANIC 2
+/* do_panic alternatives:
+ * 0: panic();
+ * 1: machine_halt; SORRY, this DOES NOT WORK
+ * 2: printk(KERN_EMERG), plus flag to fail all eventual drbd IO, plus panic()
+ */
+
+extern volatile int drbd_did_panic;
+
+#if DRBD_PANIC == 0
+#define drbd_panic(fmt, args...) \
+ panic(DEVICE_NAME "%d: " fmt, (int)(mdev-drbd_conf) , ##args)
+#elif DRBD_PANIC == 1
+#error "sorry , this does not work, please contribute"
+#else
+#define drbd_panic(fmt, args...) \
do { \ + printk(KERN_EMERG DEVICE_NAME "%d: " fmt, \ + (int)(mdev-drbd_conf) , ##args); \ + drbd_did_panic = DRBD_MAGIC; \ + smp_mb(); \ + panic(DEVICE_NAME "%d: " fmt, (int)(mdev-drbd_conf) , ##args); \ +} while (0) +#endif +#undef DRBD_PANIC + +/*** + * on the wire + *********************************************************************/ + +typedef enum { + Data, + DataReply, // Response to DataRequest + RSDataReply, // Response to RSDataRequest + Barrier, + ReportParams, + ReportBitMap, + BecomeSyncTarget, + BecomeSyncSource, + UnplugRemote, // Used at various times to hint the peer to hurry up + DataRequest, // Used to ask for a data block + RSDataRequest, // Used to ask for a data block + SyncParam, + + Ping, // These are sent on the meta socket... + PingAck, + RecvAck, // Used in protocol B + WriteAck, // Used in protocol C + NegAck, // Sent if local disk is unusable + NegDReply, // Local disk is broken... + NegRSDReply, // Local disk is broken... + BarrierAck, + + MAX_CMD, + MayIgnore = 0x100, // Flag only to test if (cmd > MayIgnore) ... + MAX_OPT_CMD, + + HandShake = 0xfffe // FIXED for the next century! +} Drbd_Packet_Cmd; + +static inline const char* cmdname(Drbd_Packet_Cmd cmd) +{ + /* THINK may need to become several global tables + * when we want to support more than + * one PRO_VERSION */ + static const char *cmdnames[] = { + [Data] = "Data", + [DataReply] = "DataReply", + [RSDataReply] = "RSDataReply", + [Barrier] = "Barrier", + [ReportParams] = "ReportParams", + [ReportBitMap] = "ReportBitMap", + [BecomeSyncTarget] = "BecomeSyncTarget", + [BecomeSyncSource] = "BecomeSyncSource", + [UnplugRemote] = "UnplugRemote", + [DataRequest] = "DataRequest", + [RSDataRequest] = "RSDataRequest", + [SyncParam] = "SyncParam", + [Ping] = "Ping", + [PingAck] = "PingAck", + [RecvAck] = "RecvAck", + [WriteAck] = "WriteAck", + [NegAck] = "NegAck", + [NegDReply] = "NegDReply", + [NegRSDReply] = "NegRSDReply", + [BarrierAck] = "BarrierAck" + }; + + if (cmd == HandShake) return "HandShake"; + if (Data > cmd || cmd >= MAX_CMD) return "Unknown"; + return cmdnames[cmd]; +} + + +/* This is the layout for a packet on the wire. + * The byteorder is the network byte order. + * (except block_id and barrier fields. + * these are pointers to local structs + * and have no relevance for the partner, + * which just echoes them as received.) + * + * NOTE that the payload starts at a long aligned offset, + * regardless of 32 or 64 bit arch! + */ +typedef struct { + u32 magic; + u16 command; + u16 length; // bytes of data after this header + char payload[0]; +} __attribute((packed)) Drbd_Header; +// 8 bytes. packet FIXED for the next century! + +/* + * short commands, packets without payload, plain Drbd_Header: + * Ping + * PingAck + * BecomeSyncTarget + * BecomeSyncSource + * UnplugRemote + */ + +/* + * commands with out-of-struct payload: + * ReportBitMap (no additional fields) + * Data, DataReply (see Drbd_Data_Packet) + */ +typedef struct { + Drbd_Header head; + u64 sector; // 64 bits sector number + u64 block_id; // Used in protocol B&C for the address of the req. 
+} __attribute((packed)) Drbd_Data_Packet; + +/* + * commands which share a struct: + * RecvAck (proto B), WriteAck (proto C) (see Drbd_BlockAck_Packet) + * DataRequest, RSDataRequest (see Drbd_BlockRequest_Packet) + */ +typedef struct { + Drbd_Header head; + u64 sector; + u64 block_id; + u32 blksize; + u32 pad; //make sure packet is a multiple of 8 Byte +} __attribute((packed)) Drbd_BlockAck_Packet; + +typedef struct { + Drbd_Header head; + u64 sector; + u64 block_id; + u32 blksize; + u32 pad; //make sure packet is a multiple of 8 Byte +} __attribute((packed)) Drbd_BlockRequest_Packet; + +/* + * commands with their own struct for additional fields: + * HandShake + * Barrier + * BarrierAck + * SyncParam + * ReportParams + */ + +typedef struct { + Drbd_Header head; // 8 bytes + u32 protocol_version; + u32 feature_flags; + + /* should be more than enough for future enhancements + * for now, feature_flags and the reserverd array shall be zero. + */ + + u64 reserverd[8]; +} __attribute((packed)) Drbd_HandShake_Packet; +// 80 bytes, FIXED for the next century + +typedef struct { + Drbd_Header head; + u32 barrier; // may be 0 or a barrier number + u32 pad; //make sure packet is a multiple of 8 Byte +} __attribute((packed)) Drbd_Barrier_Packet; + +typedef struct { + Drbd_Header head; + u32 barrier; + u32 set_size; +} __attribute((packed)) Drbd_BarrierAck_Packet; + +typedef struct { + Drbd_Header head; + u32 rate; + u32 use_csums; + u32 skip; + u32 group; +} __attribute((packed)) Drbd_SyncParam_Packet; + +/* FIXME add more members here, until we introduce a new fixed size + * protocol version handshake packet! */ +typedef struct { + Drbd_Header head; + u64 p_size; // size of disk + u64 u_size; // user requested size + u32 state; + u32 protocol; + u32 version; + u32 gen_cnt[GEN_CNT_SIZE]; + u32 sync_rate; + u32 sync_use_csums; + u32 skip_sync; + u32 sync_group; + u32 flags; // flags & 1 -> reply call drbd_send_param(mdev); + u32 magic; //make sure packet is a multiple of 8 Byte +} __attribute((packed)) Drbd_Parameter_Packet; + +typedef struct { + u64 size; + u32 state; + u32 blksize; + u32 protocol; + u32 version; + u32 gen_cnt[5]; + u32 bit_map_gen[5]; +} __attribute((packed)) Drbd06_Parameter_P; + +typedef union { + Drbd_Header head; + Drbd_HandShake_Packet HandShake; + Drbd_Data_Packet Data; + Drbd_BlockAck_Packet BlockAck; + Drbd_Barrier_Packet Barrier; + Drbd_BarrierAck_Packet BarrierAck; + Drbd_SyncParam_Packet SyncParam; + Drbd_Parameter_Packet Parameter; + Drbd_BlockRequest_Packet BlockRequest; +} __attribute((packed)) Drbd_Polymorph_Packet; + +/**********************************************************************/ + +typedef enum { + None, + Running, + Exiting, + Restarting +} Drbd_thread_state; + +struct Drbd_thread { + spinlock_t t_lock; + struct task_struct *task; + struct completion startstop; + Drbd_thread_state t_state; + int (*function) (struct Drbd_thread *); + drbd_dev *mdev; +}; + +static inline Drbd_thread_state get_t_state(struct Drbd_thread *thi) +{ + /* THINK testing the t_state seems to be uncritical in all cases + * (but thread_{start,stop}), so we can read it *without* the lock. + * --lge */ + + smp_rmb(); + return (volatile int)thi->t_state; +} + + +/* + * Having this as the first member of a struct provides sort of "inheritance". + * "derived" structs can be "drbd_queue_work()"ed. + * The callback should know and cast back to the descendant struct. + * drbd_request and Tl_epoch_entry are descendants of drbd_work. 
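+ *
+ * Purely for illustration (w_example is not part of this patch): such a
+ * callback recovers its containing object by casting back, which works
+ * because the drbd_work member is placed first in the descendant struct:
+ *
+ *   int w_example(drbd_dev *mdev, struct drbd_work *w, int cancel)
+ *   {
+ *           struct drbd_request *req = (struct drbd_request *)w;
+ *           // ... operate on req, e.g. complete or requeue it ...
+ *           return 1;
+ *   }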
+ */
+struct drbd_work;
+typedef int (*drbd_work_cb)(drbd_dev*, struct drbd_work*, int cancel);
+struct drbd_work {
+ struct list_head list;
+ drbd_work_cb cb;
+};
+
+/*
+ * since we eventually don't want to "remap" any bhs, but always need a
+ * private bh, it may as well be part of the struct so we do not need to
+ * allocate it separately. it is only used as a clone, and since we own it, we
+ * can abuse certain fields of it for our own needs. and, since it is part of
+ * the struct, we can use b_private for other things than the req, e.g. mdev,
+ * since we get the request struct by means of the "container_of()" macro.
+ * -lge
+ */
+
+struct drbd_barrier;
+struct drbd_request {
+ struct drbd_work w;
+ long magic;
+ int rq_status;
+ struct drbd_barrier *barrier; // The next barrier.
+ drbd_bio_t *master_bio; // master bio pointer
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)
+ drbd_bio_t private_bio; // private bio struct
+#else
+ struct bio *private_bio;
+ drbd_dev *mdev;
+#endif
+};
+
+struct drbd_barrier {
+ struct list_head requests; // requests before
+ struct drbd_barrier *next; // pointer to the next barrier
+ int br_number; // the barrier's identifier.
+ int n_req; // number of requests attached before this barrier
+};
+
+typedef struct drbd_request drbd_request_t;
+
+/* These Tl_epoch_entries may be in one of 6 lists:
+ free_ee .. free entries
+ active_ee .. data packet being written
+ sync_ee .. syncer block being written
+ done_ee .. block written, need to send WriteAck
+ read_ee .. [RS]DataRequest being read
+ net_ee .. zero-copy network send in progress
+*/
+
+/* Since whenever we allocate a Tl_epoch_entry, we also allocate a buffer_head
+ * at the same time, we might as well put it as member into the struct.
+ * Yes, we may "waste" a little memory since the unused EEs on the free_ee list
+ * are somewhat larger. For 2.6, this will be a struct bio, which is fairly
+ * small, and since we adapt the amount dynamically anyway, this is not an
+ * issue.
+ *
+ * TODO
+ * I'd like to "drop" the free list altogether, since we use mempools, which
+ * are designed for this. We probably would still need a private "page pool"
+ * to do the "bio_add_page" from.
+ * -lge
+ */
+struct Tl_epoch_entry {
+ struct drbd_work w;
+ drbd_bio_t private_bio; // private bio struct, NOT a pointer
+ u64 block_id;
+ long magic;
+ ONLY_IN_26(unsigned int ee_size;)
+ ONLY_IN_26(sector_t ee_sector;)
+ // THINK: maybe we rather want bio_alloc(GFP_*,1)
+ ONLY_IN_26(struct bio_vec ee_bvec;)
+};
+
+/* flag bits */
+enum {
+ ISSUE_BARRIER, // next Data is preceded by a Barrier
+ SIGNAL_ASENDER, // whether asender wants to be interrupted
+ SEND_PING, // whether asender should send a ping asap
+ WRITER_PRESENT, // somebody opened us with write intent
+ STOP_SYNC_TIMER, // tell timer to cancel itself
+ DO_NOT_INC_CONCNT, // well, don't ...
+ ON_PRI_INC_HUMAN, // When we become primary increase human-count
+ ON_PRI_INC_TIMEOUTEX, // When " - " increase timeout-count
+ UNPLUG_QUEUED, // only relevant with kernel 2.4
+ UNPLUG_REMOTE, // whether sending a "UnplugRemote" makes sense
+ DISKLESS, // no local disk
+ PARTNER_DISKLESS, // partner has no storage
+ PARTNER_CONSISTENT, // partner has consistent data
+ PROCESS_EE_RUNNING, // eek!
+ MD_IO_ALLOWED, // EXPLAIN
+ SENT_DISK_FAILURE, // sending it once is enough
+ MD_DIRTY, // current gen counts and flags not yet on disk
+ SYNC_STARTED, // Needed to agree on the exact point in time..
+ USE_DEGR_WFC_T, // Use degr-wfc-timeout instead of wfc-timeout.
+}; + +struct drbd_bitmap; // opaque for Drbd_Conf + +// TODO sort members for performance +// MAYBE group them further + +/* THINK maybe we actually want to use the default "event/%s" worker threads + * or similar in linux 2.6, which uses per cpu data and threads. + * + * To be general, this might need a spin_lock member. + * For now, please use the mdev->req_lock to protect list_head, + * see drbd_queue_work below. + */ +struct drbd_work_queue { + struct list_head q; + struct semaphore s; // producers up it, worker down()s it +}; + +/* If Philipp agrees, we remove the "mutex", and make_request will only + * (throttle on "queue full" condition and) queue it to the worker thread... + * which then is free to do whatever is needed, and has exclusive send access + * to the data socket ... + */ +struct drbd_socket { + struct drbd_work_queue work; + struct semaphore mutex; + struct socket *socket; + Drbd_Polymorph_Packet sbuf; // this way we get our + Drbd_Polymorph_Packet rbuf; // send/receive buffers off the stack +}; + +struct Drbd_Conf { +#ifdef PARANOIA + long magic; +#endif + struct net_config conf; + struct syncer_config sync_conf; + enum io_error_handler on_io_error; + struct semaphore device_mutex; + struct drbd_socket data; // for data/barrier/cstate/parameter packets + struct drbd_socket meta; // for ping/ack (metadata) packets + volatile unsigned long last_received; // in jiffies, either socket + volatile unsigned int ko_count; + struct drbd_work resync_work, + barrier_work, + unplug_work; + struct timer_list resync_timer; +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + kdev_t backing_bdev; // backing device + kdev_t this_bdev; + kdev_t md_bdev; // device for meta-data. +#else + struct block_device *backing_bdev; + struct block_device *this_bdev; + struct block_device *md_bdev; + struct gendisk *vdisk; + request_queue_t *rq_queue; +#endif + // THINK is this the same in 2.6.x ?? + struct file *lo_file; + struct file *md_file; + int md_index; + sector_t lo_usize; /* user provided size */ + sector_t p_size; /* partner's disk size */ + Drbd_State state; + volatile Drbd_CState cstate; + wait_queue_head_t cstate_wait; // TODO Rename into "misc_wait". 
+ Drbd_State o_state;
+ sector_t la_size; // last agreed disk size
+ unsigned int send_cnt;
+ unsigned int recv_cnt;
+ unsigned int read_cnt;
+ unsigned int writ_cnt;
+ unsigned int al_writ_cnt;
+ unsigned int bm_writ_cnt;
+ atomic_t ap_bio_cnt; // Requests we need to complete
+ atomic_t ap_pending_cnt; // AP data packets on the wire, ack expected
+ atomic_t rs_pending_cnt; // RS request/data packets on the wire
+ atomic_t unacked_cnt; // Need to send replies for
+ atomic_t local_cnt; // Waiting for local disk to signal completion
+ spinlock_t req_lock;
+ spinlock_t tl_lock;
+ struct drbd_barrier* newest_barrier;
+ struct drbd_barrier* oldest_barrier;
+ unsigned long flags;
+ struct task_struct *send_task; /* about pid calling drbd_send */
+ spinlock_t send_task_lock;
+ // sector_t rs_left; // blocks not up-to-date [unit BM_BLOCK_SIZE]
+ // moved into bitmap->bm_set
+ unsigned long rs_total; // blocks to sync in this run [unit BM_BLOCK_SIZE]
+ unsigned long rs_start; // Syncer's start time [unit jiffies]
+ unsigned long rs_paused; // cumulative time in PausedSyncX state [unit jiffies]
+ unsigned long rs_mark_left;// blocks not up-to-date at mark [unit BM_BLOCK_SIZE]
+ unsigned long rs_mark_time;// mark's time [unit jiffies]
+ struct Drbd_thread receiver;
+ struct Drbd_thread worker;
+ struct Drbd_thread asender;
+ struct drbd_bitmap* bitmap;
+ struct lru_cache* resync; // Used to track operations of resync...
+ atomic_t resync_locked; // Number of locked elements in resync LRU
+ int open_cnt;
+ u32 gen_cnt[GEN_CNT_SIZE];
+ atomic_t epoch_size;
+ spinlock_t ee_lock;
+ struct list_head free_ee; // available
+ struct list_head active_ee; // IO in progress
+ struct list_head sync_ee; // IO in progress
+ struct list_head done_ee; // send ack
+ struct list_head read_ee; // IO in progress
+ struct list_head net_ee; // zero-copy network send in progress
+ spinlock_t pr_lock;
+ struct list_head app_reads;
+ struct list_head resync_reads;
+ int ee_vacant;
+ int ee_in_use;
+ wait_queue_head_t ee_wait;
+ NOT_IN_26(struct tq_struct write_hint_tq;)
+ struct page *md_io_page; // one page buffer for md_io
+ struct page *md_io_tmpp; // in case hardsect != 512 [ s390 only?
] + struct semaphore md_io_mutex; // protects the md_io_buffer + spinlock_t al_lock; + wait_queue_head_t al_wait; + struct lru_cache* act_log; // activity log + unsigned int al_tr_number; + int al_tr_cycle; + int al_tr_pos; // position of the next transaction in the journal +}; + + +/* + * function declarations + *************************/ + +// drbd_main.c +extern void _set_cstate(drbd_dev* mdev,Drbd_CState cs); +extern void drbd_thread_start(struct Drbd_thread *thi); +extern void _drbd_thread_stop(struct Drbd_thread *thi, int restart, int wait); +extern void drbd_free_resources(drbd_dev *mdev); +extern void tl_release(drbd_dev *mdev,unsigned int barrier_nr, + unsigned int set_size); +extern void tl_clear(drbd_dev *mdev); +extern int tl_dependence(drbd_dev *mdev, drbd_request_t * item); +extern void drbd_free_sock(drbd_dev *mdev); +extern int drbd_send(drbd_dev *mdev, struct socket *sock, + void* buf, size_t size, unsigned msg_flags); +extern int drbd_send_param(drbd_dev *mdev, int flags); +extern int _drbd_send_cmd(drbd_dev *mdev, struct socket *sock, + Drbd_Packet_Cmd cmd, Drbd_Header *h, + size_t size, unsigned msg_flags); +extern int drbd_send_cmd(drbd_dev *mdev, struct socket *sock, + Drbd_Packet_Cmd cmd, Drbd_Header *h, size_t size); +extern int drbd_send_sync_param(drbd_dev *mdev, struct syncer_config *sc); +extern int drbd_send_b_ack(drbd_dev *mdev, u32 barrier_nr, + u32 set_size); +extern int drbd_send_ack(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + struct Tl_epoch_entry *e); +extern int _drbd_send_page(drbd_dev *mdev, struct page *page, + int offset, size_t size); +extern int drbd_send_block(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + struct Tl_epoch_entry *e); +extern int drbd_send_dblock(drbd_dev *mdev, drbd_request_t *req); +extern int _drbd_send_barrier(drbd_dev *mdev); +extern int drbd_send_drequest(drbd_dev *mdev, int cmd, + sector_t sector,int size, u64 block_id); +extern int drbd_send_bitmap(drbd_dev *mdev); +extern int _drbd_send_bitmap(drbd_dev *mdev); +extern void drbd_free_ll_dev(drbd_dev *mdev); +extern int drbd_io_error(drbd_dev* mdev); +extern void drbd_mdev_cleanup(drbd_dev *mdev); + +// drbd_meta-data.c (still in drbd_main.c) +extern void drbd_md_write(drbd_dev *mdev); +extern int drbd_md_read(drbd_dev *mdev); +extern int drbd_md_compare(drbd_dev *mdev,Drbd_Parameter_Packet *partner); +extern void drbd_dump_md(drbd_dev *, Drbd_Parameter_Packet *, int ); +// maybe define them below as inline? +extern void drbd_md_inc(drbd_dev *mdev, enum MetaDataIndex order); +extern void drbd_md_set_flag(drbd_dev *mdev, int flags); +extern void drbd_md_clear_flag(drbd_dev *mdev, int flags); +extern int drbd_md_test_flag(drbd_dev *mdev, int flag); + +/* Meta data layout + We reserve a 128MB Block (4k aligned) + * either at the end of the backing device + * or on a seperate meta data device. */ + +#define MD_RESERVED_SIZE ( 128LU * (1<<10) ) // 128 MB ( in units of kb ) +// The following numbers are sectors +#define MD_GC_OFFSET 0 +#define MD_AL_OFFSET 8 // 8 Sectors after start of meta area +#define MD_AL_MAX_SIZE 64 // = 32 kb LOG ~ 3776 extents ~ 14 GB Storage +#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE) //Allows up to about 3.8TB + +#define MD_HARDSECT_B 9 // Since the smalles IO unit is usually 512 byte +#define MD_HARDSECT (1< we need 32 KB bitmap. + * Bit 0 ==> local node thinks this block is binary identical on both nodes + * Bit 1 ==> local node thinks this block needs to be synced. 
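+ *
+ * (Worked example, for orientation only: at 4 KiB of storage per bit,
+ *  one 512-byte sector of on-disk bitmap holds 512*8 = 4096 bits and so
+ *  covers 16 MiB of storage; the 32 KB of bitmap mentioned above cover
+ *  32768*8 = 262144 bits, i.e. 1 GiB of storage.)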
+ */ + +#define BM_BLOCK_SIZE_B 12 // 4k per bit +#define BM_BLOCK_SIZE (1<>(BM_BLOCK_SIZE_B-9)) +#define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SIZE_B-9)) +#define BM_SECT_PER_BIT BM_BIT_TO_SECT(1) + +/* bit to represented kilo byte conversion */ +#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SIZE_B-10)) + +/* in which _bitmap_ extent (resp. sector) the bit for a certain + * _storage_ sector is located in */ +#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SIZE_B-9)) + +/* in one sector of the bitmap, we have this many activity_log extents. */ +#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SIZE_B - AL_EXTENT_SIZE_B) ) +#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SIZE_B-BM_BLOCK_SIZE_B-LN2_BPL)) + + +/* I want the packet to fit within one page + * THINK maybe use a special bitmap header, + * including offset and compression scheme and whatnot + * Do not use PAGE_SIZE here! Use a architecture agnostic constant! + */ +#define BM_PACKET_WORDS ((4096-sizeof(Drbd_Header))/sizeof(long)) + +/* the extent in "PER_EXTENT" below is an activity log extent + * we need that many (long words/bytes) to store the bitmap + * of one AL_EXTENT_SIZE chunk of storage. + * we can store the bitmap for that many AL_EXTENTS within + * one sector of the _on_disk_ bitmap: + * bit 0 bit 37 bit 38 bit (512*8)-1 + * ...|........|........|.. // ..|........| + * sect. 0 `296 `304 ^(512*8*8)-1 + * +#define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG ) +#define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128 +#define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4 + */ + +#define DRBD_MAX_SECTORS_32 (0xffffffffLU) +#define DRBD_MAX_SECTORS_BM \ + ( (MD_RESERVED_SIZE*2LL - MD_BM_OFFSET) * (1LL<<(BM_EXT_SIZE_B-9)) ) +#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32 +#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM +#elif ( !defined(CONFIG_LBD) ) && ( BITS_PER_LONG == 32 ) +#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32 +#else +#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM +#endif + +extern int drbd_bm_init (drbd_dev *mdev); +extern int drbd_bm_resize (drbd_dev *mdev, sector_t sectors); +extern void drbd_bm_cleanup (drbd_dev *mdev); +extern void drbd_bm_set_all (drbd_dev *mdev); +extern void drbd_bm_clear_all (drbd_dev *mdev); +extern void drbd_bm_reset_find(drbd_dev *mdev); +extern int drbd_bm_set_bit (drbd_dev *mdev, unsigned long bitnr); +extern int drbd_bm_test_bit (drbd_dev *mdev, unsigned long bitnr); +extern int drbd_bm_clear_bit (drbd_dev *mdev, unsigned long bitnr); +extern int drbd_bm_e_weight (drbd_dev *mdev, unsigned long enr); +extern int drbd_bm_read_sect (drbd_dev *mdev, unsigned long enr); +extern int drbd_bm_write_sect(drbd_dev *mdev, unsigned long enr); +extern void drbd_bm_read (drbd_dev *mdev); +extern void drbd_bm_write (drbd_dev *mdev); +extern unsigned long drbd_bm_ALe_set_all (drbd_dev *mdev, unsigned long al_enr); +extern size_t drbd_bm_words (drbd_dev *mdev); +extern sector_t drbd_bm_capacity (drbd_dev *mdev); +extern unsigned long drbd_bm_find_next (drbd_dev *mdev); +extern unsigned long drbd_bm_total_weight(drbd_dev *mdev); +extern int drbd_bm_rs_done(drbd_dev *mdev); +// for receive_bitmap +extern void drbd_bm_merge_lel (drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer); +// for _drbd_send_bitmap and drbd_bm_write_sect +extern void drbd_bm_get_lel (drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer); +/* + * only used by drbd_bm_read_sect +extern void drbd_bm_set_lel (drbd_dev *mdev, size_t offset, size_t number, + unsigned long* buffer); +*/ + 
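+/* Typical usage, drawn from the invalidate ioctls above: bulk bitmap
+ * manipulation is bracketed by drbd_bm_lock()/drbd_bm_unlock(), e.g.
+ *
+ *   drbd_bm_lock(mdev);
+ *   drbd_bm_set_all(mdev);   // mark every block as needing sync
+ *   drbd_bm_write(mdev);     // flush the bitmap to the meta data area
+ *   drbd_bm_unlock(mdev);
+ *
+ * (illustrative summary of the callers above, not a prescription)
+ */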
+extern void __drbd_bm_lock (drbd_dev *mdev, char* file, int line); +extern void drbd_bm_unlock (drbd_dev *mdev); +#define drbd_bm_lock(mdev) __drbd_bm_lock(mdev, __FILE__, __LINE__ ) + + +// drbd_main.c +extern drbd_dev *drbd_conf; +extern int minor_count; +extern kmem_cache_t *drbd_request_cache; +extern kmem_cache_t *drbd_ee_cache; +extern mempool_t *drbd_request_mempool; + +// drbd_req +#define ERF_NOTLD 2 /* do not call tl_dependence */ +extern void drbd_end_req(drbd_request_t *, int, int, sector_t); +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +extern int drbd_make_request_24(request_queue_t *q, int rw, struct buffer_head *bio); +#else +extern int drbd_make_request_26(request_queue_t *q, struct bio *bio); +#endif +extern int drbd_read_remote(drbd_dev *mdev, drbd_request_t *req); + +// drbd_fs.c +extern char* ppsize(char* buf, size_t size); +extern int drbd_determin_dev_size(drbd_dev*); +extern sector_t drbd_new_dev_size(struct Drbd_Conf*); +extern int drbd_set_state(drbd_dev *mdev,Drbd_State newstate); +extern int drbd_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); +extern long drbd_compat_ioctl(struct file *f, unsigned cmd, unsigned long arg); + +// drbd_worker.c +extern int drbd_worker(struct Drbd_thread *thi); +extern void drbd_alter_sg(drbd_dev *mdev, int ng); +extern void drbd_start_resync(drbd_dev *mdev, Drbd_CState side); +extern int drbd_resync_finished(drbd_dev *mdev); +// maybe rather drbd_main.c ? +extern int drbd_md_sync_page_io(drbd_dev *mdev, sector_t sector, int rw); +// worker callbacks +extern int w_is_app_read (drbd_dev *, struct drbd_work *, int); +extern int w_is_resync_read (drbd_dev *, struct drbd_work *, int); +extern int w_read_retry_remote (drbd_dev *, struct drbd_work *, int); +extern int w_e_end_data_req (drbd_dev *, struct drbd_work *, int); +extern int w_e_end_rsdata_req (drbd_dev *, struct drbd_work *, int); +extern int w_resync_inactive (drbd_dev *, struct drbd_work *, int); +extern int w_resume_next_sg (drbd_dev *, struct drbd_work *, int); +extern int w_io_error (drbd_dev *, struct drbd_work *, int); +extern int w_try_send_barrier (drbd_dev *, struct drbd_work *, int); +extern int w_send_write_hint (drbd_dev *, struct drbd_work *, int); +extern int w_make_resync_request (drbd_dev *, struct drbd_work *, int); +extern void resync_timer_fn(unsigned long data); + +// drbd_receiver.c +extern int drbd_release_ee(drbd_dev* mdev,struct list_head* list); +extern int drbd_init_ee(drbd_dev* mdev); +extern void drbd_put_ee(drbd_dev* mdev,struct Tl_epoch_entry *e); +extern struct Tl_epoch_entry* drbd_get_ee(drbd_dev* mdev); +extern void drbd_wait_ee(drbd_dev *mdev,struct list_head *head); + +// drbd_proc.c +extern struct proc_dir_entry *drbd_proc; +extern struct file_operations drbd_proc_fops; +extern const char* cstate_to_name(Drbd_CState s); +extern const char* nodestate_to_name(Drbd_State s); + +// drbd_actlog.c +extern void drbd_al_begin_io(struct Drbd_Conf *mdev, sector_t sector); +extern void drbd_al_complete_io(struct Drbd_Conf *mdev, sector_t sector); +extern void drbd_rs_complete_io(struct Drbd_Conf *mdev, sector_t sector); +extern int drbd_rs_begin_io(struct Drbd_Conf *mdev, sector_t sector); +extern void drbd_rs_cancel_all(drbd_dev* mdev); +extern int drbd_al_read_log(struct Drbd_Conf *mdev); +extern void __drbd_set_in_sync(drbd_dev* mdev, sector_t sector, int size, const char* file, const unsigned int line); +#define drbd_set_in_sync(mdev,sector,size) \ + __drbd_set_in_sync(mdev,sector,size, __FILE__, __LINE__ 
) +extern void __drbd_set_out_of_sync(drbd_dev* mdev, sector_t sector, int size, const char* file, const unsigned int line); +#define drbd_set_out_of_sync(mdev,sector,size) \ + __drbd_set_out_of_sync(mdev,sector,size, __FILE__, __LINE__ ) +extern void drbd_al_apply_to_bm(struct Drbd_Conf *mdev); +extern void drbd_al_to_on_disk_bm(struct Drbd_Conf *mdev); +extern void drbd_al_shrink(struct Drbd_Conf *mdev); + +/* + * event macros + *************************/ + +// sched.h does not have it with timeout, so here goes: + +#ifndef wait_event_interruptible_timeout +#define __wait_event_interruptible_timeout(wq, condition, ret) \ +do { \ + wait_queue_t __wait; \ + init_waitqueue_entry(&__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (condition) \ + break; \ + if (!signal_pending(current)) { \ + ret = schedule_timeout(ret); \ + if (!ret) \ + break; \ + continue; \ + } \ + ret = -EINTR; \ + break; \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#define wait_event_interruptible_timeout(wq, condition, timeout) \ +({ \ + long __ret = timeout; \ + if (!(condition)) \ + __wait_event_interruptible_timeout(wq, condition, __ret); \ + __ret; \ +}) +#endif + +/* + * inline helper functions + *************************/ + +#include "drbd_compat_wrappers.h" + +static inline int drbd_disk_less_node_present(struct Drbd_Conf* mdev) +{ + sector_t p_size = mdev->p_size; + sector_t m_size = drbd_get_capacity(mdev->backing_bdev); + + return ! ( p_size && m_size ) ; +} + +static inline void +drbd_flush_signals(struct task_struct *t) +{ + NOT_IN_26( + unsigned long flags; + LOCK_SIGMASK(t,flags); + ) + + flush_signals(t); + NOT_IN_26(UNLOCK_SIGMASK(t,flags)); +} + +static inline void set_cstate(drbd_dev* mdev,Drbd_CState ns) +{ + unsigned long flags; + spin_lock_irqsave(&mdev->req_lock,flags); + _set_cstate(mdev,ns); + spin_unlock_irqrestore(&mdev->req_lock,flags); +} + +/** + * drbd_chk_io_error: Handles the on_io_error setting, should be called from + * all io completion handlers. See also drbd_io_error(). + */ +static inline void drbd_chk_io_error(drbd_dev* mdev, int error) +{ + if (error) { + switch(mdev->on_io_error) { + case PassOn: + ERR("Ignoring local IO error!\n"); + break; + case Panic: + set_bit(DISKLESS,&mdev->flags); + smp_mb(); // but why is there smp_mb__after_clear_bit() ? + drbd_panic("IO error on backing device!\n"); + break; + case Detach: + /*lge: + * I still do not fully grasp when to set or clear + * this flag... but I want to be able to at least + * still _try_ and write the "I am inconsistent, and + * need full sync" information to the MD. */ + set_bit(MD_IO_ALLOWED,&mdev->flags); + drbd_md_set_flag(mdev,MDF_FullSync); + drbd_md_clear_flag(mdev,MDF_Consistent); + if (!test_and_set_bit(DISKLESS,&mdev->flags)) { + smp_mb(); // Nack is sent in w_e handlers. + ERR("Local IO failed. 
Detaching...\n"); + } + break; + } + } +} + +static inline int semaphore_is_locked(struct semaphore* s) +{ + if(!down_trylock(s)) { + up(s); + return 0; + } + return 1; +} +/* Returns the start sector for metadata, aligned to 4K + * which happens to be the capacity we announce for + * our lower level device if it includes the meta data + */ +static inline sector_t drbd_md_ss(drbd_dev *mdev) +{ + if( mdev->md_index == -1 ) { + if (!mdev->backing_bdev) { + if (DRBD_ratelimit(5*HZ,5)) { + ERR("mdev->backing_bdev==NULL\n"); + dump_stack(); + } + return 0; + } + return ( (drbd_get_capacity(mdev->backing_bdev) & ~7L) + - (MD_RESERVED_SIZE<<1) ); + } else { + return 2 * MD_RESERVED_SIZE * mdev->md_index; + } +} + +static inline void +_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) +{ + list_add_tail(&w->list,&q->q); + up(&q->s); +} + +static inline void +_drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) +{ + list_add(&w->list,&q->q); + up(&q->s); +} + +static inline void +drbd_queue_work_front(drbd_dev *mdev, struct drbd_work_queue *q, + struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&mdev->req_lock,flags); + list_add(&w->list,&q->q); + spin_unlock_irqrestore(&mdev->req_lock,flags); + up(&q->s); +} + +static inline void +drbd_queue_work(drbd_dev *mdev, struct drbd_work_queue *q, + struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&mdev->req_lock,flags); + list_add_tail(&w->list,&q->q); + spin_unlock_irqrestore(&mdev->req_lock,flags); + up(&q->s); +} + +static inline void wake_asender(drbd_dev *mdev) { + if(test_bit(SIGNAL_ASENDER, &mdev->flags)) { + force_sig(DRBD_SIG, mdev->asender.task); + } +} + +static inline void request_ping(drbd_dev *mdev) { + set_bit(SEND_PING,&mdev->flags); + wake_asender(mdev); +} + +static inline int drbd_send_short_cmd(drbd_dev *mdev, Drbd_Packet_Cmd cmd) +{ + Drbd_Header h; + return drbd_send_cmd(mdev,mdev->data.socket,cmd,&h,sizeof(h)); +} + +static inline int drbd_send_ping(drbd_dev *mdev) +{ + Drbd_Header h; + return drbd_send_cmd(mdev,mdev->meta.socket,Ping,&h,sizeof(h)); +} + +static inline int drbd_send_ping_ack(drbd_dev *mdev) +{ + Drbd_Header h; + return drbd_send_cmd(mdev,mdev->meta.socket,PingAck,&h,sizeof(h)); +} + +static inline void drbd_thread_stop(struct Drbd_thread *thi) +{ + _drbd_thread_stop(thi,FALSE,TRUE); +} + +static inline void drbd_thread_stop_nowait(struct Drbd_thread *thi) +{ + _drbd_thread_stop(thi,FALSE,FALSE); +} + +static inline void drbd_thread_restart_nowait(struct Drbd_thread *thi) +{ + _drbd_thread_stop(thi,TRUE,FALSE); +} + +static inline void inc_ap_pending(drbd_dev* mdev) +{ + atomic_inc(&mdev->ap_pending_cnt); +} + +#define ERR_IF_CNT_IS_NEGATIVE(which) \ + if(atomic_read(&mdev->which)<0) \ + ERR("in %s:%d: " #which " = %d < 0 !\n", \ + __func__ , __LINE__ , \ + atomic_read(&mdev->which)) + +#define dec_ap_pending(mdev) \ + typecheck(drbd_dev*,mdev); \ + if(atomic_dec_and_test(&mdev->ap_pending_cnt)) \ + wake_up(&mdev->cstate_wait); \ + ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt) + +static inline void inc_rs_pending(drbd_dev* mdev) +{ + atomic_inc(&mdev->rs_pending_cnt); +} + +#define dec_rs_pending(mdev) \ + typecheck(drbd_dev*,mdev); \ + atomic_dec(&mdev->rs_pending_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt) + +static inline void inc_unacked(drbd_dev* mdev) +{ + atomic_inc(&mdev->unacked_cnt); +} + +#if 0 && LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +/* + * idea was to forcefully push the tcp stack whenever the + * currently last pending packet is in 
the buffer. + * should be benchmarked on some real box to see if it has any + * effect on overall latency. + */ + +/* this only works with 2.6 kernels because of some conflicting defines + * in header files included from net.tcp.h. + */ + +#include +static inline void drbd_push_msock(drbd_dev* mdev) +{ + struct sock *sk; + struct tcp_opt *tp; + if (mdev->meta.socket == NULL) return; + sk = mdev->meta.socket->sk; + tp = tcp_sk(sk); + lock_sock(sk); + __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), TCP_NAGLE_PUSH); + release_sock(sk); +} + +#define dec_unacked(mdev) \ + might_sleep(); \ + typecheck(drbd_dev*,mdev); \ + if (atomic_dec_and_test(&mdev->unacked_cnt)) \ + drbd_push_msock(mdev); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); + +#define sub_unacked(mdev, n) \ + might_sleep(); \ + typecheck(drbd_dev*,mdev); \ + if (atomic_sub_and_test(n, &mdev->unacked_cnt)) \ + drbd_push_msock(mdev); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); +#else +#define dec_unacked(mdev) \ + typecheck(drbd_dev*,mdev); \ + atomic_dec(&mdev->unacked_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt) + +#define sub_unacked(mdev, n) \ + typecheck(drbd_dev*,mdev); \ + atomic_sub(n, &mdev->unacked_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt) +#endif + + +/** + * inc_local: Returns TRUE when local IO is possible. If it returns + * TRUE you should call dec_local() after IO is completed. + */ +static inline int inc_local(drbd_dev* mdev) +{ + int io_allowed; + + atomic_inc(&mdev->local_cnt); + io_allowed = !test_bit(DISKLESS,&mdev->flags); + if( !io_allowed ) { + atomic_dec(&mdev->local_cnt); + } + return io_allowed; +} + +static inline int inc_local_md_only(drbd_dev* mdev) +{ + int io_allowed; + + atomic_inc(&mdev->local_cnt); + io_allowed = !test_bit(DISKLESS,&mdev->flags) || + test_bit(MD_IO_ALLOWED,&mdev->flags); + if( !io_allowed ) { + atomic_dec(&mdev->local_cnt); + } + return io_allowed; +} + +static inline void dec_local(drbd_dev* mdev) +{ + if(atomic_dec_and_test(&mdev->local_cnt) && + test_bit(DISKLESS,&mdev->flags) && + mdev->lo_file) { + wake_up(&mdev->cstate_wait); + } + + D_ASSERT(atomic_read(&mdev->local_cnt)>=0); +} + +static inline void inc_ap_bio(drbd_dev* mdev) +{ + atomic_inc(&mdev->ap_bio_cnt); +} + +static inline void dec_ap_bio(drbd_dev* mdev) +{ + if(atomic_dec_and_test(&mdev->ap_bio_cnt)) + wake_up(&mdev->cstate_wait); + + D_ASSERT(atomic_read(&mdev->ap_bio_cnt)>=0); +} + +#ifdef DUMP_EACH_PACKET +/* + * enable to dump information about every packet exchange. + */ +#define INFOP(fmt, args...) \ + INFO("%s:%d: %s [%d] %s %s " fmt , \ + file, line, current->comm, current->pid, \ + sockname, recv?"<<<":">>>" \ + , ## args ) +static inline void +dump_packet(drbd_dev *mdev, struct socket *sock, + int recv, Drbd_Polymorph_Packet *p, char* file, int line) +{ + char *sockname = sock == mdev->meta.socket ? "meta" : "data"; + int cmd = (recv == 2) ? p->head.command : be16_to_cpu(p->head.command); + switch (cmd) { + case HandShake: + INFOP("%s (%u)\n", be32_to_cpu(p->HandShake.protocol_version)); + break; + + case Ping: + case PingAck: + case BecomeSyncTarget: + case BecomeSyncSource: + case UnplugRemote: + + case SyncParam: + case ReportParams: + INFOP("%s\n", cmdname(cmd)); + break; + + case ReportBitMap: /* don't report this */ + break; + + case Data: + case DataReply: + case RSDataReply: + + case RecvAck: /* yes I know. 
but it is the same layout */ + case WriteAck: + case NegAck: + + case DataRequest: + case RSDataRequest: + INFOP("%s (%lu,%llx)\n", cmdname(cmd), + (long)be64_to_cpu(p->Data.sector), (long long)p->Data.block_id + ); + break; + + case Barrier: + case BarrierAck: + INFOP("%s (%u)\n", cmdname(cmd), p->Barrier.barrier); + break; + + default: + INFOP("%s (%u)\n",cmdname(cmd), cmd); + break; + } +} +#else +#define dump_packet(ignored...) ((void)0) +#endif + + +#ifndef sector_div +# define sector_div(n, b)( \ +{ \ + int _res; \ + _res = (n) % (b); \ + (n) /= (b); \ + _res; \ +} \ +) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +// this is a direct copy from 2.6.6 include/linux/bitops.h + +static inline unsigned long generic_hweight64(u64 w) +{ +#if BITS_PER_LONG < 64 + return generic_hweight32((unsigned int)(w >> 32)) + + generic_hweight32((unsigned int)w); +#else + u64 res; + res = (w & 0x5555555555555555ul) + ((w >> 1) & 0x5555555555555555ul); + res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul); + res = (res & 0x0F0F0F0F0F0F0F0Ful) + ((res >> 4) & 0x0F0F0F0F0F0F0F0Ful); + res = (res & 0x00FF00FF00FF00FFul) + ((res >> 8) & 0x00FF00FF00FF00FFul); + res = (res & 0x0000FFFF0000FFFFul) + ((res >> 16) & 0x0000FFFF0000FFFFul); + return (res & 0x00000000FFFFFFFFul) + ((res >> 32) & 0x00000000FFFFFFFFul); +#endif +} + +static inline unsigned long hweight_long(unsigned long w) +{ + return sizeof(w) == 4 ? generic_hweight32(w) : generic_hweight64(w); +} +#endif + +static inline void drbd_suicide(void) +{ +#ifdef TASK_ZOMBIE + set_current_state(TASK_ZOMBIE); +#else + current->exit_state = EXIT_ZOMBIE; +#endif + schedule(); +} --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./drivers/block/drbd/drbd_main.c 2006-06-21 16:57:01.000000000 +0400 @@ -0,0 +1,2236 @@ +/* +-*- Linux-c -*- + drbd.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 1999-2004, Philipp Reisner . + main author. + + Copyright (C) 2002-2004, Lars Ellenberg . + main contributor. + + Copyright (C) 2000, Marcelo Tosatti . + Early 2.3.x work. + + Copyright (C) 2001, Lelik P.Korchagin . + Initial devfs support. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) || defined(HAVE_MM_INLINE_H) +#include +#endif +#include +#include + +#define __KERNEL_SYSCALLS__ +#include +#include + +#include +#include "drbd_int.h" + +/* YES. 
We got an official device major from lanana + */ +#define LANANA_DRBD_MAJOR 147 + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +# if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64) || defined(CONFIG_X86_64) +extern int register_ioctl32_conversion(unsigned int cmd, + int (*handler)(unsigned int, + unsigned int, + unsigned long, + struct file *)); +extern int unregister_ioctl32_conversion(unsigned int cmd); +extern asmlinkage int sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg); +# endif +#else +# ifdef CONFIG_COMPAT +# if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,10) + /* FIXME on which thing could we test instead of the KERNEL_VERSION + * again? register_ioctl32_conversion was deprecated in 2.6.10, got + * "officially" deprecated somewhen in 2.6.12, and removed in 2.6.14. + * so lets assume all vendor kernels did the transition. */ +# define HAVE_COMPAT_IOCTL_MEMBER +# else +# include +# endif +# endif +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +static devfs_handle_t devfs_handle; +#endif + +int drbdd_init(struct Drbd_thread*); +int drbd_worker(struct Drbd_thread*); +int drbd_asender(struct Drbd_thread*); + +int drbd_init(void); +STATIC int drbd_open(struct inode *inode, struct file *file); +STATIC int drbd_close(struct inode *inode, struct file *file); + +#ifdef DEVICE_REQUEST +#undef DEVICE_REQUEST +#endif +#define DEVICE_REQUEST drbd_do_request + +MODULE_AUTHOR("Philipp Reisner , Lars Ellenberg "); +MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); +MODULE_LICENSE("GPL"); +MODULE_PARM_DESC(use_nbd_major, "DEPRECATED! use nbd device major nr (43) " + "instead of the default " __stringify(LANANA_DRBD_MAJOR) ); +MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)"); +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +MODULE_PARM(use_nbd_major,"i"); +MODULE_PARM(minor_count,"i"); +#else +#include +MODULE_PARM_DESC(disable_bd_claim, "DONT USE! disables block device claiming" ); +/* + * please somebody explain to me what the "perm" of the module_param + * macro is good for (yes, permission for it in the "driverfs", but what + * do we need to do for them to show up, to begin with?) + * once I understand this, and the rest of the sysfs stuff, I probably + * be able to understand how we can move from our ioctl interface to a + * proper sysfs based one. 
+ * -- lge + */ + +/* thanks to these macros, if compiled into the kernel (not-module), + * these become boot parameters: [-drbd.major_nr-], drbd.minor_count and + * drbd.disable_io_hints + */ +module_param(use_nbd_major, bool,0); +module_param(minor_count, int,0); +module_param(disable_bd_claim,bool,0); +#endif + +// module parameter, defined +int use_nbd_major = 0; +int major_nr = LANANA_DRBD_MAJOR; +#ifdef MODULE +int minor_count = 2; +#else +int minor_count = 8; +#endif +int disable_bd_claim = 0; + +// devfs name +char* drbd_devfs_name = "drbd"; + + +// global panic flag +volatile int drbd_did_panic = 0; + +/* in 2.6.x, our device mapping and config info contains our virtual gendisks + * as member "struct gendisk *vdisk;" + */ +NOT_IN_26( +STATIC int *drbd_blocksizes; +STATIC int *drbd_sizes; +) +struct Drbd_Conf *drbd_conf; +kmem_cache_t *drbd_request_cache; +kmem_cache_t *drbd_ee_cache; +mempool_t *drbd_request_mempool; + +STATIC struct block_device_operations drbd_ops = { +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,10) + .owner = THIS_MODULE, +#endif + .open = drbd_open, + .release = drbd_close, + .ioctl = drbd_ioctl, +#ifdef HAVE_COMPAT_IOCTL_MEMBER + .compat_ioctl = drbd_compat_ioctl, +#endif +}; + +#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) + +/************************* The transfer log start */ +STATIC int tl_init(drbd_dev *mdev) +{ + struct drbd_barrier *b; + + b=kmalloc(sizeof(struct drbd_barrier),GFP_KERNEL); + if(!b) return 0; + INIT_LIST_HEAD(&b->requests); + b->next=0; + b->br_number=4711; + b->n_req=0; + + mdev->oldest_barrier = b; + mdev->newest_barrier = b; + + return 1; +} + +STATIC void tl_cleanup(drbd_dev *mdev) +{ + D_ASSERT(mdev->oldest_barrier == mdev->newest_barrier); + kfree(mdev->oldest_barrier); +} + +STATIC void tl_add(drbd_dev *mdev, drbd_request_t * new_item) +{ + struct drbd_barrier *b; + + spin_lock_irq(&mdev->tl_lock); + + b=mdev->newest_barrier; + + new_item->barrier = b; + new_item->rq_status |= RQ_DRBD_IN_TL; + list_add(&new_item->w.list,&b->requests); + + if( b->n_req++ > mdev->conf.max_epoch_size ) { + set_bit(ISSUE_BARRIER,&mdev->flags); + } + + spin_unlock_irq(&mdev->tl_lock); +} + +STATIC void tl_cancel(drbd_dev *mdev, drbd_request_t * item) +{ + struct drbd_barrier *b; + + spin_lock_irq(&mdev->tl_lock); + + b=item->barrier; + b->n_req--; + + list_del(&item->w.list); + item->rq_status &= ~RQ_DRBD_IN_TL; + + spin_unlock_irq(&mdev->tl_lock); +} + +STATIC unsigned int tl_add_barrier(drbd_dev *mdev) +{ + unsigned int bnr; + static int barrier_nr_issue=1; + struct drbd_barrier *b; + + barrier_nr_issue++; + + b=kmalloc(sizeof(struct drbd_barrier),GFP_NOIO); + if(!b) { + ERR("could not kmalloc() barrier\n"); + return 0; + } + INIT_LIST_HEAD(&b->requests); + b->next=0; + b->br_number=barrier_nr_issue; + b->n_req=0; + + spin_lock_irq(&mdev->tl_lock); + + bnr = mdev->newest_barrier->br_number; + mdev->newest_barrier->next = b; + mdev->newest_barrier = b; + + spin_unlock_irq(&mdev->tl_lock); + + return bnr; +} + +void tl_release(drbd_dev *mdev,unsigned int barrier_nr, + unsigned int set_size) +{ + struct drbd_barrier *b; + + spin_lock_irq(&mdev->tl_lock); + + b = mdev->oldest_barrier; + mdev->oldest_barrier = b->next; + + list_del(&b->requests); + /* There could be requests on the list waiting for completion + of the write to the local disk, to avoid corruptions of + slab's data structures we have to remove the lists head */ + + spin_unlock_irq(&mdev->tl_lock); + + D_ASSERT(b->br_number == barrier_nr); + D_ASSERT(b->n_req == set_size); + + kfree(b); +} 
+ +/* tl_dependence reports if this sector was present in the current + epoch. + As side effect it clears also the pointer to the request if it + was present in the transfert log. (Since tl_dependence indicates + that IO is complete and that drbd_end_req() should not be called + in case tl_clear has to be called due to interruption of the + communication) +*/ +/* bool */ +int tl_dependence(drbd_dev *mdev, drbd_request_t * item) +{ + unsigned long flags; + int r=TRUE; + + spin_lock_irqsave(&mdev->tl_lock,flags); + + r = ( item->barrier == mdev->newest_barrier ); + list_del(&item->w.list); + + spin_unlock_irqrestore(&mdev->tl_lock,flags); + return r; +} + +void tl_clear(drbd_dev *mdev) +{ + struct list_head *le,*tle; + struct drbd_barrier *b,*f,*new_first; + struct drbd_request *r; + sector_t sector; + unsigned int size; + + new_first=kmalloc(sizeof(struct drbd_barrier),GFP_NOIO); + if(!new_first) { + ERR("could not kmalloc() barrier\n"); + } + + /* FIXME if indeed we could not kmalloc, this will Oops! + * can we somehow just recycle one of the existing barriers? + */ + INIT_LIST_HEAD(&new_first->requests); + new_first->next=0; + new_first->br_number=4711; + new_first->n_req=0; + + spin_lock_irq(&mdev->tl_lock); + + b=mdev->oldest_barrier; + mdev->oldest_barrier = new_first; + mdev->newest_barrier = new_first; + + spin_unlock_irq(&mdev->tl_lock); + + inc_ap_pending(mdev); // Since we count the old first as well... + + while ( b ) { + list_for_each_safe(le, tle, &b->requests) { + r = list_entry(le, struct drbd_request,w.list); + // bi_size and bi_sector are modified in bio_endio! + sector = drbd_req_get_sector(r); + size = drbd_req_get_size(r); + if( !(r->rq_status & RQ_DRBD_SENT) ) { + if(mdev->conf.wire_protocol != DRBD_PROT_A ) + dec_ap_pending(mdev); + drbd_end_req(r,RQ_DRBD_SENT,ERF_NOTLD|1, sector); + goto mark; + } + if(mdev->conf.wire_protocol != DRBD_PROT_C ) { + mark: + drbd_set_out_of_sync(mdev, sector, size); + } + } + f=b; + b=b->next; + list_del(&f->requests); + kfree(f); + dec_ap_pending(mdev); // for the barrier + } +} + +/** + * drbd_io_error: Handles the on_io_error setting, should be called in the + * unlikely(!drbd_bio_uptodate(e->bio)) case from kernel thread context. + * See also drbd_chk_io_error + * + * NOTE: we set ourselves DISKLESS here. + * But we try to write the "need full sync bit" here anyways. This is to make sure + * that you get a resynchronisation of the full device the next time you + * connect. + */ +int drbd_io_error(drbd_dev* mdev) +{ + int ok=1; + + if(mdev->on_io_error != Panic && mdev->on_io_error != Detach) return 1; + if(test_and_set_bit(SENT_DISK_FAILURE,&mdev->flags)) return 1; + + D_ASSERT(test_bit(DISKLESS,&mdev->flags)); + ok = drbd_send_param(mdev,0); + WARN("Notified peer that my disk is broken.\n"); + + D_ASSERT(drbd_md_test_flag(mdev,MDF_FullSync)); + D_ASSERT(!drbd_md_test_flag(mdev,MDF_Consistent)); + if (test_bit(MD_DIRTY,&mdev->flags)) { + // try to get "inconsistent, need full sync" to MD + drbd_md_write(mdev); + } + + if(mdev->cstate > Connected ) { + WARN("Resync aborted.\n"); + set_cstate(mdev,Connected); + mdev->rs_total = 0; + } + if ( wait_event_interruptible_timeout(mdev->cstate_wait, + atomic_read(&mdev->local_cnt) == 0 , HZ ) <= 0) { + WARN("Not releasing backing storage device.\n"); + /* FIXME if there *are* still references, + * we should be here again soon enough. + * but what if not? + * we still should free our ll and md devices */ + } else { + /* no race. 
since the DISKLESS bit is set first, + * further references to local_cnt are shortlived, + * and no real references on the device. */ + WARN("Releasing backing storage device.\n"); + drbd_free_ll_dev(mdev); + mdev->la_size=0; + } + + return ok; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,14) +// daemonize was no global symbol before 2.4.14 +/* in 2.4.6 is is prototyped as + * void daemonize(const char *name, ...) + * though, so maybe we want to do this for 2.4.x already, too. + */ +void daemonize(void) +{ + struct fs_struct *fs; + + exit_mm(current); + + current->session = 1; + current->pgrp = 1; + current->tty = NULL; + + exit_fs(current); /* current->fs->count--; */ + fs = init_task.fs; + current->fs = fs; + atomic_inc(&fs->count); + exit_files(current); + current->files = init_task.files; + atomic_inc(¤t->files->count); +} +#endif + +STATIC void drbd_daemonize(void) { +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0) + daemonize("drbd_thread"); +#else + daemonize(); + // VERIFY what about blocking signals ? + reparent_to_init(); +#endif +} + +void _set_cstate(drbd_dev* mdev,Drbd_CState ns) +{ + Drbd_CState os; + + os = mdev->cstate; + +#if DUMP_MD >= 2 + INFO("%s [%d]: cstate %s --> %s\n", current->comm, current->pid, + cstate_to_name(os), cstate_to_name(ns) ); +#endif + + mdev->cstate = ns; + smp_mb(); + wake_up(&mdev->cstate_wait); + + /* THINK. + * was: + * if ( ( os==SyncSource || os==SyncTarget ) && ns <= Connected ) { + */ + if ( ( os >= SyncSource ) && ns <= Connected ) { + clear_bit(SYNC_STARTED,&mdev->flags); + set_bit(STOP_SYNC_TIMER,&mdev->flags); + mod_timer(&mdev->resync_timer,jiffies); + } + if(test_bit(MD_IO_ALLOWED,&mdev->flags) && + test_bit(DISKLESS,&mdev->flags) && ns < Connected) { +// FIXME EXPLAIN + clear_bit(MD_IO_ALLOWED,&mdev->flags); + } +} + +STATIC int drbd_thread_setup(void* arg) +{ + struct Drbd_thread *thi = (struct Drbd_thread *) arg; + drbd_dev *mdev = thi->mdev; + int retval; + + drbd_daemonize(); + D_ASSERT(get_t_state(thi) == Running); + D_ASSERT(thi->task == NULL); + spin_lock(&thi->t_lock); + thi->task = current; + smp_mb(); + spin_unlock(&thi->t_lock); + complete(&thi->startstop); // notify: thi->task is set. + + retval = thi->function(thi); + + spin_lock(&thi->t_lock); + thi->task = 0; + thi->t_state = Exiting; + smp_mb(); + spin_unlock(&thi->t_lock); + + // THINK maybe two different completions? + complete(&thi->startstop); // notify: thi->task unset. + + return retval; +} + +STATIC void drbd_thread_init(drbd_dev *mdev, struct Drbd_thread *thi, + int (*func) (struct Drbd_thread *)) +{ + thi->t_lock = SPIN_LOCK_UNLOCKED; + thi->task = NULL; + thi->t_state = None; + init_completion(&thi->startstop); + + thi->function = func; + thi->mdev = mdev; +} + +void drbd_thread_start(struct Drbd_thread *thi) +{ + int pid; + drbd_dev *mdev = thi->mdev; + + spin_lock(&thi->t_lock); + + /* INFO("%s [%d]: %s %d -> Running\n", + current->comm, current->pid, + thi == &mdev->receiver ? "receiver" : + thi == &mdev->asender ? "asender" : + thi == &mdev->worker ? 
"worker" : "NONSENSE", + thi->t_state); */ + + if (thi->t_state == None) { + D_ASSERT(thi->task == NULL); + thi->t_state = Running; + spin_unlock(&thi->t_lock); + + pid = kernel_thread(drbd_thread_setup, (void *) thi, CLONE_FS); + if (pid < 0) { + ERR("Couldn't start thread (%d)\n", pid); + return; + } + wait_for_completion(&thi->startstop); // waits until thi->task is set + D_ASSERT(thi->task); + D_ASSERT(get_t_state(thi) == Running); + } else { + spin_unlock(&thi->t_lock); + } +} + + +void _drbd_thread_stop(struct Drbd_thread *thi, int restart,int wait) +{ + drbd_dev *mdev = thi->mdev; + Drbd_thread_state ns = restart ? Restarting : Exiting; + + spin_lock(&thi->t_lock); + + /* INFO("%s [%d]: %s %d -> %d; %d\n", + current->comm, current->pid, + thi->task ? thi->task->comm : "NULL", thi->t_state, ns, wait); */ + + + if (thi->t_state == None) { + spin_unlock(&thi->t_lock); + return; + } + + if (thi->t_state != ns) { + ERR_IF (thi->task == NULL) { + spin_unlock(&thi->t_lock); + return; + } + + if (ns == Restarting && thi->t_state == Exiting) { + // Already Exiting. Cannot restart! + spin_unlock(&thi->t_lock); + return; + } + + thi->t_state = ns; + smp_mb(); + if (thi->task != current) + force_sig(DRBD_SIGKILL,thi->task); + else + D_ASSERT(!wait); + + } + spin_unlock(&thi->t_lock); + + if (wait) { + D_ASSERT(thi->t_state == Exiting); + wait_for_completion(&thi->startstop); + spin_lock(&thi->t_lock); + thi->t_state = None; + smp_mb(); + D_ASSERT(thi->task == NULL); + spin_unlock(&thi->t_lock); + } +} + +inline sigset_t drbd_block_all_signals(void) +{ + unsigned long flags; + sigset_t oldset; + LOCK_SIGMASK(current,flags); + oldset = current->blocked; + sigfillset(¤t->blocked); + RECALC_SIGPENDING(); + UNLOCK_SIGMASK(current,flags); + return oldset; +} + +inline void restore_old_sigset(sigset_t oldset) +{ + unsigned long flags; + LOCK_SIGMASK(current,flags); + // _never_ propagate this to anywhere... 
+ sigdelset(¤t->pending.signal, DRBD_SIG); + current->blocked = oldset; + RECALC_SIGPENDING(); + UNLOCK_SIGMASK(current,flags); +} + +int _drbd_send_cmd(drbd_dev *mdev, struct socket *sock, + Drbd_Packet_Cmd cmd, Drbd_Header *h, + size_t size, unsigned msg_flags) +{ + int sent,ok; + + ERR_IF(!h) return FALSE; + ERR_IF(!size) return FALSE; + + h->magic = BE_DRBD_MAGIC; + h->command = cpu_to_be16(cmd); + h->length = cpu_to_be16(size-sizeof(Drbd_Header)); + + dump_packet(mdev,sock,0,(void*)h, __FILE__, __LINE__); + sent = drbd_send(mdev,sock,h,size,msg_flags); + + ok = ( sent == size ); + if(!ok) { + ERR("short sent %s size=%d sent=%d\n", + cmdname(cmd), (int)size, sent); + } + return ok; +} + +int drbd_send_cmd(drbd_dev *mdev, struct socket *sock, + Drbd_Packet_Cmd cmd, Drbd_Header* h, size_t size) +{ + int ok; + sigset_t old_blocked; + + if (sock == mdev->data.socket) { + down(&mdev->data.mutex); + spin_lock(&mdev->send_task_lock); + mdev->send_task=current; + spin_unlock(&mdev->send_task_lock); + } else + down(&mdev->meta.mutex); + + old_blocked = drbd_block_all_signals(); + ok = _drbd_send_cmd(mdev,sock,cmd,h,size,0); + restore_old_sigset(old_blocked); + + if (sock == mdev->data.socket) { + spin_lock(&mdev->send_task_lock); + mdev->send_task=NULL; + spin_unlock(&mdev->send_task_lock); + up(&mdev->data.mutex); + } else + up(&mdev->meta.mutex); + return ok; +} + +int drbd_send_sync_param(drbd_dev *mdev, struct syncer_config *sc) +{ + Drbd_SyncParam_Packet p; + int ok; + + p.rate = cpu_to_be32(sc->rate); + p.use_csums = cpu_to_be32(sc->use_csums); + p.skip = cpu_to_be32(sc->skip); + p.group = cpu_to_be32(sc->group); + + ok = drbd_send_cmd(mdev,mdev->data.socket,SyncParam,(Drbd_Header*)&p,sizeof(p)); + if ( ok + && (mdev->cstate == SkippedSyncS || mdev->cstate == SkippedSyncT) + && !sc->skip ) + { + /* FIXME EXPLAIN. I think this cannot work properly! -lge */ + set_cstate(mdev,WFReportParams); + ok = drbd_send_param(mdev,0); + } + return ok; +} + +int drbd_send_param(drbd_dev *mdev, int flags) +{ + Drbd_Parameter_Packet p; + int i, ok, have_disk; + unsigned long m_size; // sector_t ?? + + have_disk=inc_local(mdev); + if(have_disk) { + D_ASSERT(mdev->backing_bdev); + if (mdev->md_index == -1 ) m_size = drbd_md_ss(mdev)>>1; + else m_size = drbd_get_capacity(mdev->backing_bdev)>>1; + } else m_size = 0; + + p.u_size = cpu_to_be64(mdev->lo_usize); + p.p_size = cpu_to_be64(m_size); + + p.state = cpu_to_be32(mdev->state); + p.protocol = cpu_to_be32(mdev->conf.wire_protocol); + p.version = cpu_to_be32(PRO_VERSION); + + for (i = Flags; i < GEN_CNT_SIZE; i++) { + p.gen_cnt[i] = cpu_to_be32(mdev->gen_cnt[i]); + } + p.sync_rate = cpu_to_be32(mdev->sync_conf.rate); + p.sync_use_csums = cpu_to_be32(mdev->sync_conf.use_csums); + p.skip_sync = cpu_to_be32(mdev->sync_conf.skip); + p.sync_group = cpu_to_be32(mdev->sync_conf.group); + p.flags = cpu_to_be32(flags); + p.magic = BE_DRBD_MAGIC; + + ok = drbd_send_cmd(mdev,mdev->data.socket,ReportParams,(Drbd_Header*)&p,sizeof(p)); + if (have_disk) dec_local(mdev); + return ok; +} + +/* See the comment at receive_bitmap() */ +int _drbd_send_bitmap(drbd_dev *mdev) +{ + int want; + int ok=TRUE, bm_i=0; + size_t bm_words, num_words; + unsigned long *buffer; + Drbd_Header *p; + + ERR_IF(!mdev->bitmap) return FALSE; + + bm_words = drbd_bm_words(mdev); + p = vmalloc(PAGE_SIZE); // sleeps. cannot fail. 
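+ /* The page just allocated is used as the send buffer for one packet at
+  * a time: a Drbd_Header immediately followed by up to BM_PACKET_WORDS
+  * words of bitmap data, filled in by drbd_bm_get_lel().  The loop
+  * further down sends one such packet per chunk until all bm_words
+  * words have gone out. */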
+ buffer = (unsigned long*)p->payload; + + if (drbd_md_test_flag(mdev,MDF_FullSync)) { + drbd_bm_set_all(mdev); + drbd_bm_write(mdev); + if (unlikely(test_bit(DISKLESS,&mdev->flags))) { + /* write_bm did fail! panic. + * FIXME can we do something better than panic? + */ + drbd_panic("Failed to write bitmap to disk\n!"); + ok = FALSE; + goto out; + } + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_write(mdev); + } + + /* + * maybe TODO use some simple compression scheme, nowadays there are + * some such algorithms in the kernel anyways. + */ + do { + num_words = min_t(size_t, BM_PACKET_WORDS, bm_words-bm_i ); + want = num_words * sizeof(long); + if (want) { + drbd_bm_get_lel(mdev, bm_i, num_words, buffer); + } + ok = _drbd_send_cmd(mdev,mdev->data.socket,ReportBitMap, + p, sizeof(*p) + want, 0); + bm_i += num_words; + } while (ok && want); + + out: + vfree(p); + return ok; +} + +int drbd_send_bitmap(drbd_dev *mdev) +{ + int ok; + down(&mdev->data.mutex); + ok=_drbd_send_bitmap(mdev); + up(&mdev->data.mutex); + return ok; +} + +int _drbd_send_barrier(drbd_dev *mdev) +{ + int ok; + Drbd_Barrier_Packet p; + + /* printk(KERN_DEBUG DEVICE_NAME": issuing a barrier\n"); */ + /* tl_add_barrier() must be called with the sock_mutex aquired */ + p.barrier=tl_add_barrier(mdev); + + inc_ap_pending(mdev); + ok = _drbd_send_cmd(mdev,mdev->data.socket,Barrier,(Drbd_Header*)&p,sizeof(p),0); + +// if (!ok) dec_ap_pending(mdev); // is done in tl_clear() + return ok; +} + +int drbd_send_b_ack(drbd_dev *mdev, u32 barrier_nr,u32 set_size) +{ + int ok; + Drbd_BarrierAck_Packet p; + + p.barrier = barrier_nr; + p.set_size = cpu_to_be32(set_size); + + ok = drbd_send_cmd(mdev,mdev->meta.socket,BarrierAck,(Drbd_Header*)&p,sizeof(p)); + return ok; +} + + +int drbd_send_ack(drbd_dev *mdev, Drbd_Packet_Cmd cmd, struct Tl_epoch_entry *e) +{ + int ok; + Drbd_BlockAck_Packet p; + + p.sector = cpu_to_be64(drbd_ee_get_sector(e)); + p.block_id = e->block_id; + p.blksize = cpu_to_be32(drbd_ee_get_size(e)); + + if (!mdev->meta.socket || mdev->cstate < Connected) return FALSE; + ok = drbd_send_cmd(mdev,mdev->meta.socket,cmd,(Drbd_Header*)&p,sizeof(p)); + return ok; +} + +int drbd_send_drequest(drbd_dev *mdev, int cmd, + sector_t sector,int size, u64 block_id) +{ + int ok; + Drbd_BlockRequest_Packet p; + + p.sector = cpu_to_be64(sector); + p.block_id = block_id; + p.blksize = cpu_to_be32(size); + + ok = drbd_send_cmd(mdev,mdev->data.socket,cmd,(Drbd_Header*)&p,sizeof(p)); + return ok; +} + +/* called on sndtimeo + * returns FALSE if we should retry, + * TRUE if we think connection is dead + */ +STATIC int we_should_drop_the_connection(drbd_dev *mdev, struct socket *sock) +{ + int drop_it; + // long elapsed = (long)(jiffies - mdev->last_received); + // DUMPLU(elapsed); // elapsed ignored for now. + + drop_it = mdev->meta.socket == sock + || !mdev->asender.task + || get_t_state(&mdev->asender) != Running + || (volatile int)mdev->cstate < Connected; + + if (drop_it) + return TRUE; + + drop_it = !--mdev->ko_count; + if ( !drop_it ) { + ERR("[%s/%d] sock_sendmsg time expired, ko = %u\n", + current->comm, current->pid, mdev->ko_count); + request_ping(mdev); + } + + return drop_it; /* && (mdev->state == Primary) */; +} + +/* The idea of sendpage seems to be to put some kind of reference + to the page into the skb, and to hand it over to the NIC. In + this process get_page() gets called. + + As soon as the page was really sent over the network put_page() + gets called by some part of the network layer. [ NIC driver? 
] + + [ get_page() / put_page() increment/decrement the count. If count + reaches 0 the page will be freed. ] + + This works nicely with pages from FSs. + But this means that in protocol A we might signal IO completion too early ! + + In order not to corrupt data during a resync we must make sure + that we do not reuse our own buffer pages (EEs) to early, therefore + we have the net_ee list. + + XFS seems to have problems, still, it submits pages with page_count == 0! + As a workaround, we disable sendpage on pages with page_count == 0 or PageSlab. +*/ +int _drbd_no_send_page(drbd_dev *mdev, struct page *page, + int offset, size_t size) +{ + int ret; + ret = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0); + kunmap(page); + return ret; +} + +#ifdef DRBD_DISABLE_SENDPAGE +int _drbd_send_page(drbd_dev *mdev, struct page *page, + int offset, size_t size) +{ + int sent,ok; + int len = size; + + spin_lock(&mdev->send_task_lock); + mdev->send_task=current; + spin_unlock(&mdev->send_task_lock); + + sent = _drbd_no_send_page(mdev, page, offset, size); + if (likely(sent > 0)) len -= sent; + + spin_lock(&mdev->send_task_lock); + mdev->send_task=NULL; + spin_unlock(&mdev->send_task_lock); + + ok = (len == 0); + if (likely(ok)) + mdev->send_cnt += size>>9; + return ok; +} +#else +int _drbd_send_page(drbd_dev *mdev, struct page *page, + int offset, size_t size) +{ + mm_segment_t oldfs = get_fs(); + int sent,ok; + int len = size; + +#ifdef SHOW_SENDPAGE_USAGE + unsigned long now = jiffies; + static unsigned long total = 0; + static unsigned long fallback = 0; + static unsigned long last_rep = 0; + + /* report statistics every hour, + * if we had at least one fallback. + */ + ++total; + if (fallback && time_before(last_rep+3600*HZ, now)) { + last_rep = now; + printk(KERN_INFO DEVICE_NAME + ": sendpage() omitted: %lu/%lu\n", fallback, total); + } +#endif + + + spin_lock(&mdev->send_task_lock); + mdev->send_task=current; + spin_unlock(&mdev->send_task_lock); + + /* PARANOIA. if this ever triggers, + * something in the layers above us is really kaputt. + *one roundtrip later: + * doh. it triggered. so XFS _IS_ really kaputt ... + * oh well... + */ + if ( (page_count(page) < 1) || PageSlab(page) ) { + /* e.g. XFS meta- & log-data is in slab pages, which have a + * page_count of 0 and/or have PageSlab() set... + */ +#ifdef SHOW_SENDPAGE_USAGE + ++fallback; +#endif + sent = _drbd_no_send_page(mdev, page, offset, size); + if (likely(sent > 0)) len -= sent; + goto out; + } + + set_fs(KERNEL_DS); + do { + sent = mdev->data.socket->ops->sendpage(mdev->data.socket,page, + offset,len, + MSG_NOSIGNAL); + if (sent == -EAGAIN) { + if (we_should_drop_the_connection(mdev, + mdev->data.socket)) + break; + else + continue; + } + if (sent <= 0) { + WARN("%s: size=%d len=%d sent=%d\n", + __func__,(int)size,len,sent); + break; + } + len -= sent; + offset += sent; + // FIXME test "last_received" ... + } while(len > 0 /* THINK && mdev->cstate >= Connected*/); + set_fs(oldfs); + + out: + spin_lock(&mdev->send_task_lock); + mdev->send_task=NULL; + spin_unlock(&mdev->send_task_lock); + + ok = (len == 0); + if (likely(ok)) + mdev->send_cnt += size>>9; + return ok; +} +#endif + +// Used to send write requests: bh->b_rsector !! 
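+/* How the request data itself goes out: the bio behind a request is walked
+ * segment by segment, and each page goes through _drbd_send_page()
+ * (zero copy via ->sendpage() where that is safe) or _drbd_no_send_page()
+ * (kmap() plus an ordinary send).  A minimal sketch of such a loop (the
+ * real _drbd_send_bio()/_drbd_send_zc_bio() used below live elsewhere in
+ * this patch) might look like:
+ *
+ *	struct bio_vec *bvec;
+ *	int i;
+ *
+ *	bio_for_each_segment(bvec, bio, i) {
+ *		if (!_drbd_send_page(mdev, bvec->bv_page,
+ *				     bvec->bv_offset, bvec->bv_len))
+ *			return 0;
+ *	}
+ *	return 1;
+ */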
+int drbd_send_dblock(drbd_dev *mdev, drbd_request_t *req) +{ + int ok=1; + sigset_t old_blocked; + Drbd_Data_Packet p; + + ERR_IF(!req || !req->master_bio) return FALSE; + + p.head.magic = BE_DRBD_MAGIC; + p.head.command = cpu_to_be16(Data); + p.head.length = cpu_to_be16( sizeof(p)-sizeof(Drbd_Header) + + drbd_req_get_size(req) ); + + p.sector = cpu_to_be64(drbd_req_get_sector(req)); + p.block_id = (unsigned long)req; + + /* About tl_add(): + 1. This must be within the semaphor, + to ensure right order in tl_ data structure and to + ensure right order of packets on the write + 2. This must happen before sending, otherwise we might + get in the BlockAck packet before we have it on the + tl_ datastructure (=> We would want to remove it before it + is there!) + 3. Q: Why can we add it to tl_ even when drbd_send() might fail ? + There could be a tl_cancel() to remove it within the semaphore! + A: If drbd_send fails, we will loose the connection. Then + tl_cear() will simulate a RQ_DRBD_SEND and set it out of sync + for everything in the data structure. + */ + + /* Still called directly by drbd_make_request, + * so all sorts of processes may end up here. + * They may be interrupted by DRBD_SIG in response to + * ioctl or some other "connection lost" event. + * This is not propagated. + */ + + old_blocked = drbd_block_all_signals(); + down(&mdev->data.mutex); + spin_lock(&mdev->send_task_lock); + mdev->send_task=current; + spin_unlock(&mdev->send_task_lock); + + if(test_and_clear_bit(ISSUE_BARRIER,&mdev->flags)) + ok = _drbd_send_barrier(mdev); + if(ok) { + tl_add(mdev,req); + dump_packet(mdev,mdev->data.socket,0,(void*)&p, __FILE__, __LINE__); + set_bit(UNPLUG_REMOTE,&mdev->flags); + ok = sizeof(p) == drbd_send(mdev,mdev->data.socket,&p,sizeof(p),MSG_MORE); + if(ok) { + if(mdev->conf.wire_protocol == DRBD_PROT_A) { + ok = _drbd_send_bio(mdev,drbd_req_private_bio(req)); + } else { + ok = _drbd_send_zc_bio(mdev,drbd_req_private_bio(req)); + } + } + if(!ok) tl_cancel(mdev,req); + } + if (!ok) { + drbd_set_out_of_sync(mdev, + drbd_req_get_sector(req), + drbd_req_get_size(req)); + drbd_end_req(req,RQ_DRBD_SENT,ERF_NOTLD|1, + drbd_req_get_sector(req)); + } + spin_lock(&mdev->send_task_lock); + mdev->send_task=NULL; + spin_unlock(&mdev->send_task_lock); + + up(&mdev->data.mutex); + restore_old_sigset(old_blocked); + return ok; +} + +int drbd_send_block(drbd_dev *mdev, Drbd_Packet_Cmd cmd, + struct Tl_epoch_entry *e) +{ + int ok; + sigset_t old_blocked; + Drbd_Data_Packet p; + + p.head.magic = BE_DRBD_MAGIC; + p.head.command = cpu_to_be16(cmd); + p.head.length = cpu_to_be16( sizeof(p)-sizeof(Drbd_Header) + + drbd_ee_get_size(e) ); + + p.sector = cpu_to_be64(drbd_ee_get_sector(e)); + p.block_id = e->block_id; + + /* Only called by our kernel thread. + * This one may be interupted by DRBD_SIG and/or DRBD_SIGKILL + * in response to ioctl or module unload. 
+ */ + old_blocked = drbd_block_all_signals(); + down(&mdev->data.mutex); + spin_lock(&mdev->send_task_lock); + mdev->send_task=current; + spin_unlock(&mdev->send_task_lock); + + dump_packet(mdev,mdev->data.socket,0,(void*)&p, __FILE__, __LINE__); + ok = sizeof(p) == drbd_send(mdev,mdev->data.socket,&p,sizeof(p),MSG_MORE); + if (ok) ok = _drbd_send_zc_bio(mdev,&e->private_bio); + + spin_lock(&mdev->send_task_lock); + mdev->send_task=NULL; + spin_unlock(&mdev->send_task_lock); + up(&mdev->data.mutex); + restore_old_sigset(old_blocked); + return ok; +} + +/* + drbd_send distinguishes two cases: + + Packets sent via the data socket "sock" + and packets sent via the meta data socket "msock" + + sock msock + -----------------+-------------------------+------------------------------ + timeout conf.timeout / 2 conf.timeout / 2 + timeout action send a ping via msock Abort communication + and close all sockets +*/ + +/* + * you should have down()ed the appropriate [m]sock_mutex elsewhere! + */ +int drbd_send(drbd_dev *mdev, struct socket *sock, + void* buf, size_t size, unsigned msg_flags) +{ +#if !HAVE_KERNEL_SENDMSG + mm_segment_t oldfs; + struct iovec iov; +#else + struct kvec iov; +#endif + struct msghdr msg; + int rv,sent=0; + + if (!sock) return -1000; + if ((volatile int)mdev->cstate < WFReportParams) return -1001; + + // THINK if (signal_pending) return ... ? + + iov.iov_base = buf; + iov.iov_len = size; + + msg.msg_name = 0; + msg.msg_namelen = 0; +#if !HAVE_KERNEL_SENDMSG + msg.msg_iov = &iov; + msg.msg_iovlen = 1; +#endif + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = msg_flags | MSG_NOSIGNAL; + +#if !HAVE_KERNEL_SENDMSG + oldfs = get_fs(); + set_fs(KERNEL_DS); +#endif + + if (sock == mdev->data.socket) + mdev->ko_count = mdev->conf.ko_count; + do { + /* STRANGE + * tcp_sendmsg does _not_ use its size parameter at all ? + * + * -EAGAIN on timeout, -EINTR on signal. + */ +/* THINK + * do we need to block DRBD_SIG if sock == &meta.socket ?? + * otherwise wake_asender() might interrupt some send_*Ack ! + */ +#if !HAVE_KERNEL_SENDMSG + rv = sock_sendmsg(sock, &msg, iov.iov_len ); +#else + rv = kernel_sendmsg(sock, &msg, &iov, 1, size); +#endif + if (rv == -EAGAIN) { + if (we_should_drop_the_connection(mdev,sock)) + break; + else + continue; + } + D_ASSERT(rv != 0); + if (rv == -EINTR ) { +#if 0 + /* FIXME this happens all the time. + * we don't care for now! + * eventually this should be sorted out be the proper + * use of the SIGNAL_ASENDER bit... */ + if (DRBD_ratelimit(5*HZ,5)) { + DBG("Got a signal in drbd_send(,%c,)!\n", + sock == mdev->meta.socket ? 'm' : 's'); + // dump_stack(); + } +#endif + drbd_flush_signals(current); + rv = 0; + } + if (rv < 0) break; + sent += rv; + iov.iov_base += rv; + iov.iov_len -= rv; + } while(sent < size); + +#if !HAVE_KERNEL_SENDMSG + set_fs(oldfs); +#endif + + if (rv <= 0) { + if (rv != -EAGAIN) { + ERR("%s_sendmsg returned %d\n", + sock == mdev->meta.socket ? 
"msock" : "sock", + rv); + set_cstate(mdev, BrokenPipe); + } else + set_cstate(mdev, Timeout); + drbd_thread_restart_nowait(&mdev->receiver); + } + + return sent; +} + +STATIC int drbd_open(struct inode *inode, struct file *file) +{ + int minor; + + minor = MINOR(inode->i_rdev); + if(minor >= minor_count) return -ENODEV; + + if (file->f_mode & FMODE_WRITE) { + if( drbd_conf[minor].state == Secondary) { + return -EROFS; + } + set_bit(WRITER_PRESENT, &drbd_conf[minor].flags); + } + + drbd_conf[minor].open_cnt++; + + NOT_IN_26(MOD_INC_USE_COUNT;) + + return 0; +} + +STATIC int drbd_close(struct inode *inode, struct file *file) +{ + /* do not use *file (May be NULL, in case of a unmount :-) */ + int minor; + + minor = MINOR(inode->i_rdev); + if(minor >= minor_count) return -ENODEV; + + /* + printk(KERN_ERR DEVICE_NAME ": close(inode=%p,file=%p)" + "current=%p,minor=%d,wc=%d\n", inode, file, current, minor, + inode->i_writecount); + */ + + if (--drbd_conf[minor].open_cnt == 0) { + clear_bit(WRITER_PRESENT, &drbd_conf[minor].flags); + } + + NOT_IN_26(MOD_DEC_USE_COUNT;) + + return 0; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +STATIC void drbd_unplug_fn(void *data) +{ + struct Drbd_Conf* mdev = (drbd_dev*)data; + spin_lock_irq(&mdev->req_lock); + if (list_empty(&mdev->unplug_work.list)) + _drbd_queue_work_front(&mdev->data.work,&mdev->unplug_work); + spin_unlock_irq(&mdev->req_lock); +} +#else + +STATIC void drbd_unplug_fn(request_queue_t *q) +{ + drbd_dev *mdev = q->queuedata; + + /* unplug FIRST */ + spin_lock_irq(q->queue_lock); + blk_remove_plug(q); + spin_unlock_irq(q->queue_lock); + + /* only if connected */ + if (mdev->cstate >= Connected && !test_bit(PARTNER_DISKLESS,&mdev->flags)) { + D_ASSERT(mdev->state == Primary); + if (test_and_clear_bit(UNPLUG_REMOTE,&mdev->flags)) { + spin_lock_irq(&mdev->req_lock); + /* add to the front of the data.work queue, + * unless already queued. + * XXX this might be a good addition to drbd_queue_work + * anyways, to detect "double queuing" ... 
*/ + if (list_empty(&mdev->unplug_work.list)) + _drbd_queue_work_front(&mdev->data.work,&mdev->unplug_work); + spin_unlock_irq(&mdev->req_lock); + } + } + + if(!test_bit(DISKLESS,&mdev->flags)) drbd_kick_lo(mdev); +} +#endif + +void drbd_set_defaults(drbd_dev *mdev) +{ + mdev->flags = 1<sync_conf.rate = 250; + mdev->sync_conf.al_extents = 127; // 512 MB active set + mdev->state = Secondary; + mdev->o_state = Unknown; + mdev->cstate = Unconfigured; +} + +void drbd_init_set_defaults(drbd_dev *mdev) +{ + // the memset(,0,) did most of this + // note: only assignments, no allocation in here + +#ifdef PARANOIA + SET_MDEV_MAGIC(mdev); +#endif + + drbd_set_defaults(mdev); + + atomic_set(&mdev->ap_bio_cnt,0); + atomic_set(&mdev->ap_pending_cnt,0); + atomic_set(&mdev->rs_pending_cnt,0); + atomic_set(&mdev->unacked_cnt,0); + atomic_set(&mdev->local_cnt,0); + atomic_set(&mdev->resync_locked,0); + + init_MUTEX(&mdev->md_io_mutex); + init_MUTEX(&mdev->data.mutex); + init_MUTEX(&mdev->meta.mutex); + sema_init(&mdev->data.work.s,0); + sema_init(&mdev->meta.work.s,0); + + mdev->al_lock = SPIN_LOCK_UNLOCKED; + mdev->tl_lock = SPIN_LOCK_UNLOCKED; + mdev->ee_lock = SPIN_LOCK_UNLOCKED; + mdev->req_lock = SPIN_LOCK_UNLOCKED; + mdev->pr_lock = SPIN_LOCK_UNLOCKED; + mdev->send_task_lock = SPIN_LOCK_UNLOCKED; + + INIT_LIST_HEAD(&mdev->free_ee); + INIT_LIST_HEAD(&mdev->active_ee); + INIT_LIST_HEAD(&mdev->sync_ee); + INIT_LIST_HEAD(&mdev->done_ee); + INIT_LIST_HEAD(&mdev->read_ee); + INIT_LIST_HEAD(&mdev->net_ee); + INIT_LIST_HEAD(&mdev->app_reads); + INIT_LIST_HEAD(&mdev->resync_reads); + INIT_LIST_HEAD(&mdev->data.work.q); + INIT_LIST_HEAD(&mdev->meta.work.q); + INIT_LIST_HEAD(&mdev->resync_work.list); + INIT_LIST_HEAD(&mdev->barrier_work.list); + INIT_LIST_HEAD(&mdev->unplug_work.list); + mdev->resync_work.cb = w_resync_inactive; + mdev->barrier_work.cb = w_try_send_barrier; + mdev->unplug_work.cb = w_send_write_hint; + init_timer(&mdev->resync_timer); + mdev->resync_timer.function = resync_timer_fn; + mdev->resync_timer.data = (unsigned long) mdev; + + init_waitqueue_head(&mdev->cstate_wait); + init_waitqueue_head(&mdev->ee_wait); + init_waitqueue_head(&mdev->al_wait); + + drbd_thread_init(mdev, &mdev->receiver, drbdd_init); + drbd_thread_init(mdev, &mdev->worker, drbd_worker); + drbd_thread_init(mdev, &mdev->asender, drbd_asender); + +NOT_IN_26( + mdev->write_hint_tq.routine = &drbd_unplug_fn; + mdev->write_hint_tq.data = mdev; +) + +#ifdef __arch_um__ + INFO("mdev = 0x%p\n",mdev); +#endif +} + +void drbd_mdev_cleanup(drbd_dev *mdev) +{ + /* I'd like to cleanup completely, and memset(,0,) it. + * but I'd have to reinit it. + * FIXME: do the right thing... + */ + + /* list of things that may still + * hold data of the previous config + + * act_log ** re-initialized in set_disk + * on_io_error + + * al_tr_cycle ** re-initialized in ... FIXME?? + * al_tr_number + * al_tr_pos + + * backing_bdev ** re-initialized in drbd_free_ll_dev + * lo_file + * md_bdev + * md_file + * md_index + + * ko_count ** re-initialized in set_net + + * last_received ** currently ignored + + * mbds_id ** re-initialized in ... FIXME?? + + * resync ** re-initialized in ... FIXME?? + + *** no re-init necessary (?) *** + * md_io_page + * this_bdev + + * vdisk ? + + * rq_queue ** FIXME ASSERT ?? 
+ * newest_barrier + * oldest_barrier + */ + + drbd_thread_stop(&mdev->worker); + + if ( mdev->ee_in_use != 0 + || mdev->ee_vacant != 32 /* EE_MININUM */ + || atomic_read(&mdev->epoch_size) != 0) + ERR("ee_in_use:%d ee_vacant:%d epoch_size:%d\n", + mdev->ee_in_use, mdev->ee_vacant, atomic_read(&mdev->epoch_size)); +#define ZAP(x) memset(&x,0,sizeof(x)) + ZAP(mdev->conf); + ZAP(mdev->sync_conf); + // ZAP(mdev->data); Not yet! + // ZAP(mdev->meta); Not yet! + ZAP(mdev->gen_cnt); +#undef ZAP + mdev->al_writ_cnt = + mdev->bm_writ_cnt = + mdev->read_cnt = + mdev->recv_cnt = + mdev->send_cnt = + mdev->writ_cnt = + mdev->la_size = + mdev->lo_usize = + mdev->p_size = + mdev->rs_start = + mdev->rs_total = + mdev->rs_mark_left = + mdev->rs_mark_time = 0; + mdev->send_task = NULL; + drbd_set_my_capacity(mdev,0); + drbd_bm_resize(mdev,0); + + // just in case + drbd_free_resources(mdev); + + /* + * currently we drbd_init_ee only on module load, so + * we may do drbd_release_ee only on module unload! + * drbd_release_ee(&mdev->free_ee); + * D_ASSERT(list_emptry(&mdev->free_ee)); + * + */ + D_ASSERT(list_empty(&mdev->active_ee)); + D_ASSERT(list_empty(&mdev->sync_ee)); + D_ASSERT(list_empty(&mdev->done_ee)); + D_ASSERT(list_empty(&mdev->read_ee)); + D_ASSERT(list_empty(&mdev->net_ee)); + D_ASSERT(list_empty(&mdev->app_reads)); + D_ASSERT(list_empty(&mdev->resync_reads)); + D_ASSERT(list_empty(&mdev->data.work.q)); + D_ASSERT(list_empty(&mdev->meta.work.q)); + D_ASSERT(list_empty(&mdev->resync_work.list)); + D_ASSERT(list_empty(&mdev->barrier_work.list)); + D_ASSERT(list_empty(&mdev->unplug_work.list)); + + drbd_set_defaults(mdev); +} + + +void drbd_destroy_mempools(void) +{ + if (drbd_request_mempool) + mempool_destroy(drbd_request_mempool); + if (drbd_ee_cache && kmem_cache_destroy(drbd_ee_cache)) + printk(KERN_ERR DEVICE_NAME + ": kmem_cache_destroy(drbd_ee_cache) FAILED\n"); + if (drbd_request_cache && kmem_cache_destroy(drbd_request_cache)) + printk(KERN_ERR DEVICE_NAME + ": kmem_cache_destroy(drbd_request_cache) FAILED\n"); + // FIXME what can we do if we fail to destroy them? 
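+ /* Reset the pointers so a later call, e.g. from the error path of
+  * drbd_create_mempools() below, finds nothing left to tear down. */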
+ + drbd_request_mempool = NULL; + drbd_ee_cache = NULL; + drbd_request_cache = NULL; + + return; +} + +int drbd_create_mempools(void) +{ + // prepare our caches and mempools + drbd_request_mempool = NULL; + drbd_ee_cache = NULL; + drbd_request_cache = NULL; + + // caches + drbd_request_cache = kmem_cache_create( + "drbd_req_cache", sizeof(drbd_request_t), + 0, 0, NULL, NULL); + if (drbd_request_cache == NULL) + goto Enomem; + + drbd_ee_cache = kmem_cache_create( + "drbd_ee_cache", sizeof(struct Tl_epoch_entry), + 0, 0, NULL, NULL); + if (drbd_ee_cache == NULL) + goto Enomem; + + // mempools + drbd_request_mempool = mempool_create(16, //TODO; reasonable value + mempool_alloc_slab, mempool_free_slab, drbd_request_cache); + if (drbd_request_mempool == NULL) + goto Enomem; + + return 0; + + Enomem: + drbd_destroy_mempools(); // in case we allocated some + return -ENOMEM; +} + +static void __exit drbd_cleanup(void) +{ + int i, rr; + + if (drbd_conf) { + for (i = 0; i < minor_count; i++) { + drbd_dev *mdev = drbd_conf + i; + + if (mdev) { + down(&mdev->device_mutex); + drbd_set_state(mdev,Secondary); + up(&mdev->device_mutex); + drbd_sync_me(mdev); + set_bit(DO_NOT_INC_CONCNT,&mdev->flags); + drbd_thread_stop(&mdev->receiver); + drbd_thread_stop(&mdev->worker); + } + } + + if (drbd_proc) + remove_proc_entry("drbd",&proc_root); + i=minor_count; + while (i--) { + drbd_dev *mdev = drbd_conf+i; +ONLY_IN_26( + struct gendisk **disk = &mdev->vdisk; + request_queue_t **q = &mdev->rq_queue; +) + + drbd_free_resources(mdev); + +ONLY_IN_26( + if (*disk) { + del_gendisk(*disk); + put_disk(*disk); + *disk = NULL; + } + if (*q) blk_put_queue(*q); + *q = NULL; + + if (mdev->this_bdev->bd_holder == drbd_sec_holder) { + mdev->this_bdev->bd_contains = mdev->this_bdev; + bd_release(mdev->this_bdev); + } + if (mdev->this_bdev) bdput(mdev->this_bdev); +) + + tl_cleanup(mdev); + if (mdev->bitmap) drbd_bm_cleanup(mdev); + if (mdev->resync) lc_free(mdev->resync); + + D_ASSERT(mdev->ee_in_use==0); + + rr = drbd_release_ee(mdev,&mdev->free_ee); + // INFO("%d EEs in free list found.\n",rr); + // D_ASSERT(rr == 32); + + rr = drbd_release_ee(mdev,&mdev->active_ee); + if(rr) ERR("%d EEs in active list found!\n",rr); + + rr = drbd_release_ee(mdev,&mdev->sync_ee); + if(rr) ERR("%d EEs in sync list found!\n",rr); + + rr = drbd_release_ee(mdev,&mdev->read_ee); + if(rr) ERR("%d EEs in read list found!\n",rr); + + rr = drbd_release_ee(mdev,&mdev->done_ee); + if(rr) ERR("%d EEs in done list found!\n",rr); + + rr = drbd_release_ee(mdev,&mdev->net_ee); + if(rr) ERR("%d EEs in net list found!\n",rr); + + ERR_IF (!list_empty(&mdev->data.work.q)) { + struct list_head *lp; + list_for_each(lp,&mdev->data.work.q) { + DUMPP(lp); + } + }; + D_ASSERT(mdev->ee_vacant == 0); + + if (mdev->md_io_page) + __free_page(mdev->md_io_page); + + if (mdev->md_io_tmpp) + __free_page(mdev->md_io_tmpp); + + if (mdev->act_log) lc_free(mdev->act_log); + } + drbd_destroy_mempools(); + } + +#ifndef HAVE_COMPAT_IOCTL_MEMBER +#if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64) || defined(CONFIG_X86_64) + lock_kernel(); + unregister_ioctl32_conversion(DRBD_IOCTL_GET_VERSION); + unregister_ioctl32_conversion(DRBD_IOCTL_SET_STATE); + unregister_ioctl32_conversion(DRBD_IOCTL_SET_DISK_CONFIG); + unregister_ioctl32_conversion(DRBD_IOCTL_SET_NET_CONFIG); + unregister_ioctl32_conversion(DRBD_IOCTL_UNCONFIG_NET); + unregister_ioctl32_conversion(DRBD_IOCTL_GET_CONFIG); + unregister_ioctl32_conversion(DRBD_IOCTL_INVALIDATE); + 
unregister_ioctl32_conversion(DRBD_IOCTL_INVALIDATE_REM); + unregister_ioctl32_conversion(DRBD_IOCTL_SET_SYNC_CONFIG); + unregister_ioctl32_conversion(DRBD_IOCTL_SET_DISK_SIZE); + unregister_ioctl32_conversion(DRBD_IOCTL_WAIT_CONNECT); + unregister_ioctl32_conversion(DRBD_IOCTL_WAIT_SYNC); + unregister_ioctl32_conversion(DRBD_IOCTL_UNCONFIG_DISK); + unregister_ioctl32_conversion(DRBD_IOCTL_SET_STATE_FLAGS); + unlock_kernel(); +#endif +#endif + +NOT_IN_26( + blksize_size[MAJOR_NR] = NULL; + blk_size[MAJOR_NR] = NULL; + // kfree(NULL) is noop + kfree(drbd_blocksizes); + kfree(drbd_sizes); +) + kfree(drbd_conf); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + devfs_unregister(devfs_handle); +#else + devfs_remove(drbd_devfs_name); +#endif + + if (unregister_blkdev(MAJOR_NR, DEVICE_NAME) != 0) + printk(KERN_ERR DEVICE_NAME": unregister of device failed\n"); + + printk(KERN_INFO DEVICE_NAME": module cleanup done.\n"); +} + +int sizeof_drbd_structs_sanity_check(void); +int __init drbd_init(void) +{ + int i,err; + +#if 0 +#warning "DEBUGGING" +/* I am too lazy to calculate this by hand -lge + */ +#define SZO(x) printk(KERN_ERR "sizeof(" #x ") = %d\n", sizeof(x)) + SZO(struct Drbd_Conf); + SZO(struct buffer_head); + SZO(Drbd_Polymorph_Packet); + SZO(struct drbd_socket); + SZO(struct bm_extent); + SZO(struct lc_element); + SZO(struct semaphore); + SZO(struct drbd_request); + SZO(struct bio); + SZO(wait_queue_head_t); + SZO(spinlock_t); + SZO(Drbd_Header); + SZO(Drbd_HandShake_Packet); + SZO(Drbd_Barrier_Packet); + SZO(Drbd_BarrierAck_Packet); + SZO(Drbd_SyncParam_Packet); + SZO(Drbd_Parameter_Packet); + SZO(Drbd06_Parameter_P); + SZO(Drbd_Data_Packet); + SZO(Drbd_BlockAck_Packet); + printk(KERN_ERR "AL_EXTENTS_PT = %d\n",AL_EXTENTS_PT); + printk(KERN_ERR "DRBD_MAX_SECTORS = %llu\n",DRBD_MAX_SECTORS); + return -EBUSY; +#endif + + if (sizeof(Drbd_HandShake_Packet) != 80) { + printk(KERN_ERR DEVICE_NAME + ": never change the size or layout of the HandShake packet.\n"); + return -EINVAL; + } + if (sizeof_drbd_structs_sanity_check()) { + return -EINVAL; + } + + if (use_nbd_major) { + major_nr = NBD_MAJOR; + } + + if (1 > minor_count||minor_count > 255) { + printk(KERN_ERR DEVICE_NAME + ": invalid minor_count (%d)\n",minor_count); +#ifdef MODULE + return -EINVAL; +#else + minor_count = 8; +#endif + } + + err = register_blkdev(MAJOR_NR, DEVICE_NAME + NOT_IN_26(, &drbd_ops) + ); + if (err) { + printk(KERN_ERR DEVICE_NAME + ": unable to register block device major %d\n", + MAJOR_NR); + return err; + } + + drbd_devfs_name = (major_nr == NBD_MAJOR) ? 
"nbd" : "drbd"; + + /* + * allocate all necessary structs + */ + err = -ENOMEM; + + drbd_proc = NULL; // play safe for drbd_cleanup + drbd_conf = kmalloc(sizeof(drbd_dev)*minor_count,GFP_KERNEL); + if (likely(drbd_conf!=NULL)) + memset(drbd_conf,0,sizeof(drbd_dev)*minor_count); + else goto Enomem; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + drbd_sizes = kmalloc(sizeof(int)*minor_count,GFP_KERNEL); + if (likely(drbd_sizes!=NULL)) + memset(drbd_sizes,0,sizeof(int)*minor_count); + else goto Enomem; + drbd_blocksizes = kmalloc(sizeof(int)*minor_count,GFP_KERNEL); + if (unlikely(!drbd_blocksizes)) goto Enomem; +#else + + devfs_mk_dir(drbd_devfs_name); + + for (i = 0; i < minor_count; i++) { + drbd_dev *mdev = drbd_conf + i; + struct gendisk *disk; + request_queue_t *q; + + q = blk_alloc_queue(GFP_KERNEL); + if (!q) goto Enomem; + mdev->rq_queue = q; + q->queuedata = mdev; + + disk = alloc_disk(1); + if (!disk) goto Enomem; + mdev->vdisk = disk; + + set_disk_ro( disk, TRUE ); + + disk->queue = q; + disk->major = MAJOR_NR; + disk->first_minor = i; + disk->fops = &drbd_ops; + sprintf(disk->disk_name, DEVICE_NAME "%d", i); + sprintf(disk->devfs_name, "%s/%d", drbd_devfs_name, i); + disk->private_data = mdev; + add_disk(disk); + + mdev->this_bdev = bdget(MKDEV(MAJOR_NR,i)); + // we have no partitions. we contain only ourselves. + mdev->this_bdev->bd_contains = mdev->this_bdev; + if (bd_claim(mdev->this_bdev,drbd_sec_holder)) { + // Initial we are Secondary -> should claim myself. + WARN("Could not bd_claim() myself."); + } else if (disable_bd_claim) { + bd_release(mdev->this_bdev); + } + + blk_queue_make_request(q,drbd_make_request_26); + q->queue_lock = &mdev->req_lock; // needed since we use + // plugging on a queue, that actually has no requests! + q->unplug_fn = drbd_unplug_fn; + } +#endif + + if ((err = drbd_create_mempools())) + goto Enomem; + + for (i = 0; i < minor_count; i++) { + drbd_dev *mdev = &drbd_conf[i]; + struct page *page = alloc_page(GFP_KERNEL); + + drbd_init_set_defaults(mdev); + +NOT_IN_26( + drbd_blocksizes[i] = INITIAL_BLOCK_SIZE; + mdev->this_bdev = MKDEV(MAJOR_NR, i); + set_device_ro( MKDEV(MAJOR_NR, i), TRUE ); +) + + if(!page) goto Enomem; + mdev->md_io_page = page; + + if (drbd_bm_init(mdev)) goto Enomem; + // no need to lock access, we are still initializing the module. 
+ mdev->resync = lc_alloc(17, sizeof(struct bm_extent),mdev); + if (!mdev->resync) goto Enomem; + mdev->act_log = lc_alloc(mdev->sync_conf.al_extents, + sizeof(struct lc_element), mdev); + if (!mdev->act_log) goto Enomem; + + init_MUTEX(&mdev->device_mutex); + if (!tl_init(mdev)) goto Enomem; + if (!drbd_init_ee(mdev)) goto Enomem; + } + +#if CONFIG_PROC_FS + /* + * register with procfs + */ + drbd_proc = create_proc_entry("drbd", S_IFREG | S_IRUGO , &proc_root); + + if (!drbd_proc) { + printk(KERN_ERR DEVICE_NAME": unable to register proc file\n"); + goto Enomem; + } + + drbd_proc->proc_fops = &drbd_proc_fops; + drbd_proc->owner = THIS_MODULE; +#else +# error "Currently drbd depends on the proc file system (CONFIG_PROC_FS)" +#endif +NOT_IN_26( + blksize_size[MAJOR_NR] = drbd_blocksizes; + blk_size[MAJOR_NR] = drbd_sizes; +) + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + devfs_handle = devfs_mk_dir (NULL, drbd_devfs_name, NULL); + devfs_register_series(devfs_handle, "%u", minor_count, + DEVFS_FL_DEFAULT, MAJOR_NR, 0, + S_IFBLK | S_IRUSR | S_IWUSR, + &drbd_ops, NULL); +#endif + + NOT_IN_26(blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR),drbd_make_request_24);) + +#ifndef HAVE_COMPAT_IOCTL_MEMBER +#if defined(CONFIG_PPC64) || defined(CONFIG_SPARC64) || defined(CONFIG_X86_64) + // tell the kernel that we think our ioctls are 64bit clean + lock_kernel(); + register_ioctl32_conversion(DRBD_IOCTL_GET_VERSION,NULL); + register_ioctl32_conversion(DRBD_IOCTL_SET_STATE,NULL); + register_ioctl32_conversion(DRBD_IOCTL_SET_DISK_CONFIG,NULL); + register_ioctl32_conversion(DRBD_IOCTL_SET_NET_CONFIG,NULL); + register_ioctl32_conversion(DRBD_IOCTL_UNCONFIG_NET,NULL); + register_ioctl32_conversion(DRBD_IOCTL_GET_CONFIG,NULL); + register_ioctl32_conversion(DRBD_IOCTL_INVALIDATE,NULL); + register_ioctl32_conversion(DRBD_IOCTL_INVALIDATE_REM,NULL); + register_ioctl32_conversion(DRBD_IOCTL_SET_SYNC_CONFIG,NULL); + register_ioctl32_conversion(DRBD_IOCTL_SET_DISK_SIZE,NULL); + register_ioctl32_conversion(DRBD_IOCTL_WAIT_CONNECT,NULL); + register_ioctl32_conversion(DRBD_IOCTL_WAIT_SYNC,NULL); + register_ioctl32_conversion(DRBD_IOCTL_UNCONFIG_DISK,NULL); + register_ioctl32_conversion(DRBD_IOCTL_SET_STATE_FLAGS,NULL); + unlock_kernel(); +#endif +#endif + + printk(KERN_INFO DEVICE_NAME ": initialised. " + "Version: " REL_VERSION " (api:%d/proto:%d)\n", + API_VERSION,PRO_VERSION); + printk(KERN_INFO DEVICE_NAME ": %s\n", drbd_buildtag()); + if (use_nbd_major) { + printk(KERN_INFO DEVICE_NAME": hijacking NBD device major!\n"); + } + printk(KERN_INFO DEVICE_NAME": registered as block device major %d\n", MAJOR_NR); + + return 0; // Success! 
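+ /* A single error label is enough: drbd_cleanup() checks every pointer
+  * before tearing it down, so it copes with a partially set up module. */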
+ + Enomem: + drbd_cleanup(); + if (err == -ENOMEM) // currently always the case + printk(KERN_ERR DEVICE_NAME ": ran out of memory\n"); + else + printk(KERN_ERR DEVICE_NAME ": initialization failure\n"); + return err; +} + +void drbd_free_ll_dev(drbd_dev *mdev) +{ + struct file *lo_file; + + lo_file = mdev->lo_file; + mdev->lo_file = 0; + wmb(); + + if (lo_file) { +NOT_IN_26( + blkdev_put(lo_file->f_dentry->d_inode->i_bdev,BDEV_FILE); + blkdev_put(mdev->md_file->f_dentry->d_inode->i_bdev,BDEV_FILE); +) +ONLY_IN_26( + bd_release(mdev->backing_bdev); + bd_release(mdev->md_bdev); +) + mdev->md_bdev = + mdev->backing_bdev = 0; + + fput(lo_file); + fput(mdev->md_file); + // mdev->lo_file = 0; + mdev->md_file = 0; + } +} + +void drbd_free_sock(drbd_dev *mdev) +{ + if (mdev->data.socket) { + sock_release(mdev->data.socket); + mdev->data.socket = 0; + } + if (mdev->meta.socket) { + sock_release(mdev->meta.socket); + mdev->meta.socket = 0; + } +} + + +void drbd_free_resources(drbd_dev *mdev) +{ + drbd_free_sock(mdev); + drbd_free_ll_dev(mdev); +} + +/*********************************/ +/* meta data management */ + +struct meta_data_on_disk { + u64 la_size; // last agreed size. + u32 gc[GEN_CNT_SIZE]; // generation counter + u32 magic; + u32 md_size; + u32 al_offset; // offset to this block + u32 al_nr_extents; // important for restoring the AL + u32 bm_offset; // offset to the bitmap, from here +} __attribute((packed)); + +/* + +FIXME md_io might fail unnoticed sometimes ... + +*/ +void drbd_md_write(drbd_dev *mdev) +{ + struct meta_data_on_disk * buffer; + u32 flags; + sector_t sector; + int i; + + ERR_IF(!inc_local_md_only(mdev)) return; + + down(&mdev->md_io_mutex); + buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + memset(buffer,0,512); + + flags = mdev->gen_cnt[Flags] & ~(MDF_PrimaryInd|MDF_ConnectedInd); + if (mdev->state == Primary) flags |= MDF_PrimaryInd; + if (mdev->cstate >= WFReportParams) flags |= MDF_ConnectedInd; + mdev->gen_cnt[Flags] = flags; + + for (i = Flags; i < GEN_CNT_SIZE; i++) + buffer->gc[i]=cpu_to_be32(mdev->gen_cnt[i]); + buffer->la_size=cpu_to_be64(drbd_get_capacity(mdev->this_bdev)>>1); + buffer->magic=cpu_to_be32(DRBD_MD_MAGIC); + + buffer->md_size = __constant_cpu_to_be32(MD_RESERVED_SIZE); + buffer->al_offset = __constant_cpu_to_be32(MD_AL_OFFSET); + buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements); + + buffer->bm_offset = __constant_cpu_to_be32(MD_BM_OFFSET); + + sector = drbd_md_ss(mdev) + MD_GC_OFFSET; + +#if 0 + /* FIXME sooner or later I'd like to use the MD_DIRTY flag everywhere, + * so we can avoid unneccessary md writes. + */ + ERR_IF (!test_bit(MD_DIRTY,&mdev->flags)) { + dump_stack(); + } +#endif + + if (drbd_md_sync_page_io(mdev,sector,WRITE)) { + clear_bit(MD_DIRTY,&mdev->flags); + } else { + if (test_bit(DISKLESS,&mdev->flags)) { + /* this was a try anyways ... */ + ERR("meta data update failed!\n"); + } else { + /* If we cannot write our meta data, + * but we are supposed to be able to, + * tough! + */ + drbd_panic("meta data update failed!\n"); + } + } + + // why is this here?? please EXPLAIN. + mdev->la_size = drbd_get_capacity(mdev->this_bdev)>>1; + + up(&mdev->md_io_mutex); + dec_local(mdev); +} + +/* + * return: + * < 0 if we had an error (currently never ...) + * = 0 if we need a FullSync because either the flag is set, + * or the gen counts are invalid + * > 0 if we could read valid gen counts, + * and reading the bitmap and act log does make sense. 
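+ *
+ * If reading the block fails, or the magic does not match, a fresh state
+ * block is written (MDF_FullSync set, all generation counts 1) and 0 is
+ * returned, i.e. a full sync is required.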
+ */ +int drbd_md_read(drbd_dev *mdev) +{ + struct meta_data_on_disk * buffer; + sector_t sector; + int i; + + if(!inc_local_md_only(mdev)) return -1; + + down(&mdev->md_io_mutex); + buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + + sector = drbd_md_ss(mdev) + MD_GC_OFFSET; + +/* FIXME different failure cases: IO error or invalid magic */ + + ERR_IF( ! drbd_md_sync_page_io(mdev,sector,READ) ) goto err; + + if(be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) goto err; + + for(i=Flags;i<=ArbitraryCnt;i++) + mdev->gen_cnt[i]=be32_to_cpu(buffer->gc[i]); + mdev->la_size = be64_to_cpu(buffer->la_size); + mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); + if (mdev->sync_conf.al_extents < 7) + mdev->sync_conf.al_extents = 127; + + up(&mdev->md_io_mutex); + dec_local(mdev); + + return !drbd_md_test_flag(mdev,MDF_FullSync); + + err: + up(&mdev->md_io_mutex); + dec_local(mdev); + + INFO("Creating state block\n"); + + /* if we need to create a state block, we are + * not consistent, and need a sync of the full device! + * if one knows what he is doing, he can manipulate gcs by hand, + * and avoid the initial full sync... + * otherwise, one of us will have to be forced (--do-what-I-say) + * to be primary, before anything is usable. + */ + set_bit(MD_DIRTY,&mdev->flags); + mdev->gen_cnt[Flags] = MDF_FullSync; + for(i = HumanCnt; i < GEN_CNT_SIZE; i++) mdev->gen_cnt[i]=1; + +/* FIXME might have IO errors! */ + drbd_md_write(mdev); + + return 0; +} + +#if DUMP_MD >= 1 +#define MeGC(x) mdev->gen_cnt[x] +#define PeGC(x) be32_to_cpu(peer->gen_cnt[x]) + +void drbd_dump_md(drbd_dev *mdev, Drbd_Parameter_Packet *peer, int verbose) +{ + INFO("I am(%c): %c:%08x:%08x:%08x:%08x:%c%c\n", + mdev->state == Primary ? 'P':'S', + MeGC(Flags) & MDF_Consistent ? '1' : '0', + MeGC(HumanCnt), + MeGC(TimeoutCnt), + MeGC(ConnectedCnt), + MeGC(ArbitraryCnt), + MeGC(Flags) & MDF_PrimaryInd ? '1' : '0', + MeGC(Flags) & MDF_ConnectedInd ? '1' : '0'); + if (peer) { + INFO("Peer(%c): %c:%08x:%08x:%08x:%08x:%c%c\n", + be32_to_cpu(peer->state) == Primary ? 'P':'S', + PeGC(Flags) & MDF_Consistent ? '1' : '0', + PeGC(HumanCnt), + PeGC(TimeoutCnt), + PeGC(ConnectedCnt), + PeGC(ArbitraryCnt), + PeGC(Flags) & MDF_PrimaryInd ? '1' : '0', + PeGC(Flags) & MDF_ConnectedInd ? '1' : '0'); + } else { + INFO("Peer Unknown.\n"); + } + if (verbose) { + /* TODO + * dump activity log and bitmap summary, + * and maybe other statistics + */ + } +} + +#undef MeGC +#undef PeGC +#else +void drbd_dump_md(drbd_dev *mdev, Drbd_Parameter_Packet *peer, int verbose) +{ /* do nothing */ } +#endif + +// Returns 1 if I have the good bits, +// 0 if both are nice +// -1 if the partner has the good bits. +int drbd_md_compare(drbd_dev *mdev,Drbd_Parameter_Packet *partner) +{ + int i; + u32 me,other; + + /* FIXME + * we should not only rely on the consistent bit, but at least check + * whether the rest of the gencounts is plausible, to detect a previous + * split brain situation, and refuse anything until we are told + * otherwise! + * + * And we should refuse to become SyncSource if we are not consistent! + * + * though DRBD is not to blame for it, + * someone eventually will try to blame it ... 
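+ *
+ * The comparison below is purely lexicographic: the Consistent flag is
+ * the most significant "digit", then HumanCnt, TimeoutCnt, ConnectedCnt
+ * and ArbitraryCnt in that order, and finally the PrimaryInd flag
+ * breaks any remaining tie.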
+ */ + + me=mdev->gen_cnt[Flags] & MDF_Consistent; + other=be32_to_cpu(partner->gen_cnt[Flags]) & MDF_Consistent; + if( me > other ) return 1; + if( me < other ) return -1; + + for(i=HumanCnt;i<=ArbitraryCnt;i++) { + me=mdev->gen_cnt[i]; + other=be32_to_cpu(partner->gen_cnt[i]); + if( me > other ) return 1; + if( me < other ) return -1; + } + + me=mdev->gen_cnt[Flags] & MDF_PrimaryInd; + other=be32_to_cpu(partner->gen_cnt[Flags]) & MDF_PrimaryInd; + if( me > other ) return 1; + if( me < other ) return -1; + + return 0; +} + +/* THINK do these have to be protected by some lock ? */ +void drbd_md_inc(drbd_dev *mdev, enum MetaDataIndex order) +{ + set_bit(MD_DIRTY,&mdev->flags); + mdev->gen_cnt[order]++; +} +void drbd_md_set_flag(drbd_dev *mdev, int flag) +{ + if ( (mdev->gen_cnt[Flags] & flag) != flag) { + set_bit(MD_DIRTY,&mdev->flags); + mdev->gen_cnt[Flags] |= flag; + } +} +void drbd_md_clear_flag(drbd_dev *mdev, int flag) +{ + if ( (mdev->gen_cnt[Flags] & flag) != 0 ) { + set_bit(MD_DIRTY,&mdev->flags); + mdev->gen_cnt[Flags] &= ~flag; + } +} +int drbd_md_test_flag(drbd_dev *mdev, int flag) +{ + return ((mdev->gen_cnt[Flags] & flag) != 0); +} + +module_init(drbd_init) +module_exit(drbd_cleanup) --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./drivers/block/drbd/drbd_proc.c 2006-06-21 16:57:01.000000000 +0400 @@ -0,0 +1,294 @@ +/* +-*- linux-c -*- + drbd_proc.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 1999-2004, Philipp Reisner . + main author. + + Copyright (C) 2002-2004, Lars Ellenberg . + main contributor. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "drbd_int.h" + +STATIC int drbd_proc_open(struct inode *inode, struct file *file); +STATIC int drbd_seq_show(struct seq_file *seq, void *v); + + +struct proc_dir_entry *drbd_proc; +struct file_operations drbd_proc_fops = { + .owner = THIS_MODULE, + .open = drbd_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +// We ommit single_open and single_release, since that is only available +// after 2.4.23 +static void *single_start(struct seq_file *p, loff_t *pos) +{ + return NULL + (*pos == 0); +} + +static void *single_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + return NULL; +} + +static void single_stop(struct seq_file *p, void *v) +{ +} + +struct seq_operations drbd_proc_seq_ops = { + .start = single_start, + .next = single_next, + .stop = single_stop, + .show = drbd_seq_show, +}; + +/*lge + * progress bars shamelessly adapted from driver/md/md.c + * output looks like + * [=====>..............] 
33.5% (23456/123456) + * finish: 2:20:20 speed: 6,345 (6,456) K/sec + */ +STATIC void drbd_syncer_progress(struct Drbd_Conf* mdev, struct seq_file *seq) +{ + unsigned long res , db, dt, dbdt, rt, rs_left; + + /* the whole sector_div thingy was wrong (did overflow, + * did not use correctly typed parameters), and is not even + * neccessary as long as rs_total and drbd_bm_total_weight + * are both unsigned long. + * + * this is to break it at compile time when we change that + * (we may feel 4TB maximum storage per drbd is not enough) + */ + typecheck(unsigned long, mdev->rs_total); + + /* note: both rs_total and rs_left are in bits, i.e. in + * units of BM_BLOCK_SIZE. + * for the percentage, we don't care. */ + + rs_left = drbd_bm_total_weight(mdev); + /* >> 10 to prevent overflow, + * +1 to prevent division by zero */ + if (rs_left > mdev->rs_total) { + /* doh. logic bug somewhere. + * for now, just try to prevent in-kernel buffer overflow. + */ + ERR("logic bug? rs_left=%lu > rs_total=%lu\n", + rs_left, mdev->rs_total); + res = 1000; + } else { + res = (rs_left >> 10)*1000/((mdev->rs_total >> 10) + 1); + } + { + int i, y = res/50, x = 20-y; + seq_printf(seq, "\t["); + for (i = 1; i < x; i++) + seq_printf(seq, "="); + seq_printf(seq, ">"); + for (i = 0; i < y; i++) + seq_printf(seq, "."); + seq_printf(seq, "] "); + } + res = 1000L - res; + seq_printf(seq,"sync'ed:%3lu.%lu%% ", res / 10, res % 10); + /* if more than 1 GB display in MB */ + if (mdev->rs_total > 0x100000L) { + seq_printf(seq,"(%lu/%lu)M\n\t", + (unsigned long) Bit2KB(rs_left) >> 10, + (unsigned long) Bit2KB(mdev->rs_total) >> 10 ); + } else { + seq_printf(seq,"(%lu/%lu)K\n\t", + (unsigned long) Bit2KB(rs_left), + (unsigned long) Bit2KB(mdev->rs_total) ); + } + + /* see drivers/md/md.c + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + dt = (jiffies - mdev->rs_mark_time) / HZ; + + if (dt > 20) { + /* if we made no update to rs_mark_time for too long, + * we are stalled. show that. 
*/ + seq_printf(seq, "stalled\n"); + return; + } + + if (!dt) dt++; + db = mdev->rs_mark_left - rs_left; + rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ + + seq_printf(seq, "finish: %lu:%02lu:%02lu", + rt / 3600, (rt % 3600) / 60, rt % 60); + + /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */ + dbdt = Bit2KB(db/dt); + if (dbdt > 1000) + seq_printf(seq, " speed: %ld,%03ld", + dbdt/1000,dbdt % 1000); + else + seq_printf(seq, " speed: %ld", dbdt); + + /* mean speed since syncer started + * we do account for PausedSync periods */ + dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; + if (dt <= 0) dt=1; + db = mdev->rs_total - rs_left; + dbdt = Bit2KB(db/dt); + if (dbdt > 1000) + seq_printf(seq, " (%ld,%03ld)", + dbdt/1000,dbdt % 1000); + else + seq_printf(seq, " (%ld)", dbdt); + + seq_printf(seq," K/sec\n"); +} + +const char* cstate_to_name(Drbd_CState s) { + static const char *cstate_names[] = { + [Unconfigured] = "Unconfigured", + [StandAlone] = "StandAlone", + [Unconnected] = "Unconnected", + [Timeout] = "Timeout", + [BrokenPipe] = "BrokenPipe", + [NetworkFailure] = "NetworkFailure", + [WFConnection] = "WFConnection", + [WFReportParams] = "WFReportParams", + [Connected] = "Connected", + [SkippedSyncS] = "SkippedSyncS", + [SkippedSyncT] = "SkippedSyncT", + [WFBitMapS] = "WFBitMapS", + [WFBitMapT] = "WFBitMapT", + [SyncSource] = "SyncSource", + [SyncTarget] = "SyncTarget", + [PausedSyncS] = "PausedSyncS", + [PausedSyncT] = "PausedSyncT", + }; + + return s < Unconfigured ? "TO_SMALL" : + s > PausedSyncT ? "TO_LARGE" + : cstate_names[s]; +} + +const char* nodestate_to_name(Drbd_State s) { + static const char *state_names[] = { + [Primary] = "Primary", + [Secondary] = "Secondary", + [Unknown] = "Unknown" + }; + + return s < Unknown ? "TO_SMALL" : + s > Secondary ? "TO_LARGE" + : state_names[s]; +} + + +STATIC int drbd_seq_show(struct seq_file *seq, void *v) +{ + int i; + const char *sn; + + seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d)\n%s\n", + API_VERSION,PRO_VERSION, drbd_buildtag()); + + /* + cs .. connection state + st .. node state (local/remote) + ld .. local data consistentency + ns .. network send + nr .. network receive + dw .. disk write + dr .. disk read + pe .. pending (waiting for ack) + ua .. unack'd (still need to send ack) + al .. access log write count + */ + + for (i = 0; i < minor_count; i++) { + sn = cstate_to_name(drbd_conf[i].cstate); + if(drbd_conf[i].cstate == Connected) { + if(test_bit(DISKLESS,&drbd_conf[i].flags)) + sn = "DiskLessClient"; + if(test_bit(PARTNER_DISKLESS,&drbd_conf[i].flags)) + sn = "ServerForDLess"; + } + if ( drbd_conf[i].cstate == Unconfigured ) + seq_printf( seq, "%2d: cs:Unconfigured\n", i); + else + seq_printf( seq, + "%2d: cs:%s st:%s/%s ld:%s\n" + " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " + "lo:%d pe:%d ua:%d ap:%d\n", + i, sn, + nodestate_to_name(drbd_conf[i].state), + nodestate_to_name(drbd_conf[i].o_state), + (drbd_conf[i].gen_cnt[Flags] + & MDF_Consistent) ? "Consistent" : "Inconsistent", + // FIXME partner consistent? 
+ drbd_conf[i].send_cnt/2, + drbd_conf[i].recv_cnt/2, + drbd_conf[i].writ_cnt/2, + drbd_conf[i].read_cnt/2, + drbd_conf[i].al_writ_cnt, + drbd_conf[i].bm_writ_cnt, + atomic_read(&drbd_conf[i].local_cnt), + atomic_read(&drbd_conf[i].ap_pending_cnt) + + atomic_read(&drbd_conf[i].rs_pending_cnt), + atomic_read(&drbd_conf[i].unacked_cnt), + atomic_read(&drbd_conf[i].ap_bio_cnt) + ); + + if ( drbd_conf[i].cstate == SyncSource || + drbd_conf[i].cstate == SyncTarget ) + drbd_syncer_progress(drbd_conf+i,seq); + } + + return 0; +} + +STATIC int drbd_proc_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &drbd_proc_seq_ops); +} + +/* PROC FS stuff end */ --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./drivers/block/drbd/drbd_receiver.c 2006-06-21 16:57:01.000000000 +0400 @@ -0,0 +1,2414 @@ +/* +-*- linux-c -*- + drbd_receiver.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 1999-2004, Philipp Reisner . + main author. + + Copyright (C) 2002-2004, Lars Ellenberg . + main contributor. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) || defined(HAVE_MM_INLINE_H) +#include +#endif +#include +#include +#include +#define __KERNEL_SYSCALLS__ +#include +#include +#include +#include "drbd_int.h" + +#define EE_MININUM 32 // @4k pages => 128 KByte + +#define is_syncer_blk(A,B) ((B)==ID_SYNCER) + +#if defined(__arch_um__) && !defined(HAVE_UML_TO_VIRT) +static inline void *to_virt(unsigned long phys) +{ + return((void *) uml_physmem + phys); +} +#endif + +#ifdef DBG_ASSERTS +void drbd_assert_breakpoint(drbd_dev *mdev, char *exp, + char *file, int line) +{ + ERR("ASSERT( %s ) in %s:%d\n", exp, file, line); +} +#endif + + +#if 0 +#define CHECK_LIST_LIMIT 1000 +void check_list(drbd_dev *mdev,struct list_head *list,char *t) +{ + struct list_head *le,*la; + int forward=0,backward=0; + + le=list; + do { + la=le; + le=le->next; + if( le->prev != la ) { + printk(KERN_ERR DEVICE_NAME + "%d: %s list fucked.\n", + (int)(mdev-drbd_conf),t); + break; + } + if( forward++ > CHECK_LIST_LIMIT ) { + printk(KERN_ERR DEVICE_NAME + "%d: %s forward > 1000\n", + (int)(mdev-drbd_conf),t); + break; + } + } while(le != list); + + le=list; + do { + la=le; + le=le->prev; + if( le->next != la ) { + printk(KERN_ERR DEVICE_NAME + "%d: %s list fucked.\n", + (int)(mdev-drbd_conf),t); + break; + } + if( backward++ > CHECK_LIST_LIMIT ) { + printk(KERN_ERR DEVICE_NAME + "%d: %s backward > 1000\n", + (int)(mdev-drbd_conf),t); + break; + } + } while(le != list); + + if(forward != backward) { + printk(KERN_ERR DEVICE_NAME "%d: forward=%d, backward=%d\n", + (int)(mdev-drbd_conf),forward,backward); + } +} +#endif + +#if 0 +STATIC inline int is_syncer_blk(drbd_dev *mdev, u64 block_id) +{ + 
if ( block_id == ID_SYNCER ) return 1; + /* Use this code if you are working with a VIA based mboard :) */ + if ( (long)block_id == (long)-1) { + printk(KERN_ERR DEVICE_NAME + "%d: strange block_id %lx%lx\n",(int)(mdev-drbd_conf), + (unsigned long)(block_id>>32), + (unsigned long)block_id); + return 1; + } + return 0; +} +#endif //PARANOIA + +/* +You need to hold the ee_lock: + drbd_free_ee() + drbd_get_ee() + drbd_put_ee() + _drbd_process_ee() + +You must not have the ee_lock: + _drbd_alloc_ee() + drbd_alloc_ee() + drbd_init_ee() + drbd_release_ee() + drbd_ee_fix_bhs() + drbd_process_ee() + drbd_clear_done_ee() + drbd_wait_ee() +*/ + +STATIC int _drbd_alloc_ee(drbd_dev *mdev,struct page* page,int mask) +{ + struct Tl_epoch_entry* e; + + /* kmem_cache does not like to mix different memory types. + * so even if we alloc'ed the page from HIGHMEM, + * the ee comes from normal memory. + */ + e = kmem_cache_alloc(drbd_ee_cache, mask & ~(__GFP_HIGHMEM)); + if( e == NULL ) return FALSE; + + drbd_ee_init(e,page); + spin_lock_irq(&mdev->ee_lock); + list_add(&e->w.list,&mdev->free_ee); + mdev->ee_vacant++; + spin_unlock_irq(&mdev->ee_lock); + + return TRUE; +} + +/* bool */ +STATIC int drbd_alloc_ee(drbd_dev *mdev,int mask) +{ + struct page *page; + + page=alloc_page(mask); + if(!page) return FALSE; + + /* if we got the page, we really want the ee, too, + * even for "GFP_TRY". + * we may wait, but better not cause IO, + * we might be in the IO path (of our peer). + */ + if(!_drbd_alloc_ee(mdev,page,mask | GFP_NOIO)) { + __free_page(page); + return FALSE; + } + + return TRUE; +} + +STATIC struct page* drbd_free_ee(drbd_dev *mdev, struct list_head *list) +{ + struct list_head *le; + struct Tl_epoch_entry* e; + struct page* page; + + MUST_HOLD(&mdev->ee_lock); + + D_ASSERT(!list_empty(list)); + le = list->next; + e = list_entry(le, struct Tl_epoch_entry, w.list); + list_del(le); + + page = drbd_bio_get_page(&e->private_bio); +ONLY_IN_26( + D_ASSERT(page == e->ee_bvec.bv_page); + page = e->ee_bvec.bv_page; +) + kmem_cache_free(drbd_ee_cache, e); + mdev->ee_vacant--; + + return page; +} + +int drbd_init_ee(drbd_dev *mdev) +{ + while(mdev->ee_vacant < EE_MININUM ) { + if(!drbd_alloc_ee(mdev,GFP_USER)) { + ERR("Failed to allocate %d EEs !\n",EE_MININUM); + return 0; + } + } + return 1; +} + +int drbd_release_ee(drbd_dev *mdev,struct list_head* list) +{ + int count=0; + + spin_lock_irq(&mdev->ee_lock); + while(!list_empty(list)) { + __free_page(drbd_free_ee(mdev,list)); + count++; + } + spin_unlock_irq(&mdev->ee_lock); + + return count; +} + +#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0) +#define GFP_TRY ( __GFP_HIGHMEM | __GFP_NOWARN ) +#else +#define GFP_TRY ( __GFP_HIGHMEM ) +#endif + +STATIC int _drbd_process_ee(drbd_dev *mdev, int be_sleepy); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +STATIC void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue(q, wait); + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} + +STATIC void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + + spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + spin_unlock_irqrestore(&q->lock, flags); +} + +#define DEFINE_WAIT(name) \ + wait_queue_t name = { \ + .task = current, \ + .task_list = { .next = &name.task_list, \ + .prev = 
&name.task_list, \ + }, \ + } + +#endif + +/** + * drbd_get_ee: Returns an Tl_epoch_entry; might sleep. Fails only if + * a signal comes in. + */ +struct Tl_epoch_entry* drbd_get_ee(drbd_dev *mdev) +{ + struct list_head *le; + struct Tl_epoch_entry* e; + DEFINE_WAIT(wait); + + MUST_HOLD(&mdev->ee_lock); + + if(mdev->ee_vacant == EE_MININUM / 2) { + spin_unlock_irq(&mdev->ee_lock); + drbd_kick_lo(mdev); + spin_lock_irq(&mdev->ee_lock); + } + + if(list_empty(&mdev->free_ee)) _drbd_process_ee(mdev,1); + + if(list_empty(&mdev->free_ee)) { + for (;;) { + prepare_to_wait(&mdev->ee_wait, &wait, + TASK_INTERRUPTIBLE); + if(!list_empty(&mdev->free_ee)) break; + spin_unlock_irq(&mdev->ee_lock); + if( ( mdev->ee_vacant+mdev->ee_in_use) < + mdev->conf.max_buffers ) { + if(drbd_alloc_ee(mdev,GFP_TRY)) { + /* race race race + * (currently harmless for drbd07, since drbd_get_ee is called by + * receiver_thread only. solved with different implementation in + * drbd-plus already.) + */ + spin_lock_irq(&mdev->ee_lock); + break; + } + } + drbd_kick_lo(mdev); + schedule(); + spin_lock_irq(&mdev->ee_lock); + finish_wait(&mdev->ee_wait, &wait); + if (signal_pending(current)) { + WARN("drbd_get_ee interrupted!\n"); + return 0; + } + // finish wait is inside, so that we are TASK_RUNNING + // in _drbd_process_ee (which might sleep by itself.) + _drbd_process_ee(mdev,1); + } + finish_wait(&mdev->ee_wait, &wait); + } + + /* race race race */ + le=mdev->free_ee.next; + list_del(le); + mdev->ee_vacant--; + mdev->ee_in_use++; + e=list_entry(le, struct Tl_epoch_entry, w.list); +ONLY_IN_26( + D_ASSERT(e->private_bio.bi_idx == 0); + drbd_ee_init(e,e->ee_bvec.bv_page); // reinitialize +) + e->block_id = !ID_VACANT; + SET_MAGIC(e); + return e; +} + +void drbd_put_ee(drbd_dev *mdev,struct Tl_epoch_entry *e) +{ + struct page* page; + + MUST_HOLD(&mdev->ee_lock); + + D_ASSERT(page_count(drbd_bio_get_page(&e->private_bio)) == 1); + + mdev->ee_in_use--; + mdev->ee_vacant++; + e->block_id = ID_VACANT; + INVALIDATE_MAGIC(e); + list_add_tail(&e->w.list,&mdev->free_ee); + + if((mdev->ee_vacant * 2 > mdev->ee_in_use ) && + ( mdev->ee_vacant + mdev->ee_in_use > EE_MININUM) ) { + // FIXME cleanup: never returns NULL anymore + page=drbd_free_ee(mdev,&mdev->free_ee); + if( page ) __free_page(page); + } + if(mdev->ee_in_use == 0) { + while( mdev->ee_vacant > EE_MININUM ) { + __free_page(drbd_free_ee(mdev,&mdev->free_ee)); + } + } + + wake_up(&mdev->ee_wait); +} + +STATIC void reclaim_net_ee(drbd_dev *mdev) +{ + struct Tl_epoch_entry *e; + struct list_head *le,*tle; + + /* The EEs are always appended to the end of the list, since + they are sent in order over the wire, they have to finish + in order. As soon as we see the first not finished we can + stop to examine the list... */ + + list_for_each_safe(le, tle, &mdev->net_ee) { + e = list_entry(le, struct Tl_epoch_entry, w.list); + if( page_count(drbd_bio_get_page(&e->private_bio)) > 1 ) break; + list_del(le); + drbd_put_ee(mdev,e); + } +} + + +/* It is important that the head list is really empty when returning, + from this function. Note, this function is called from all three + threads (receiver, worker and asender). 
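+   (The callbacks invoked here -- typically e_end_block() or
+   e_end_resync_block() for completed writes -- may send packets to the
+   peer, which is why the ee_lock is dropped around each call.)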
To ensure this I only allow + one thread at a time in the body of the function */ +STATIC int _drbd_process_ee(drbd_dev *mdev, int be_sleepy) +{ + struct Tl_epoch_entry *e; + struct list_head *head = &mdev->done_ee; + struct list_head *le; + int ok=1; + int got_sig; + + MUST_HOLD(&mdev->ee_lock); + + reclaim_net_ee(mdev); + + if( test_and_set_bit(PROCESS_EE_RUNNING,&mdev->flags) ) { + if(!be_sleepy) { + return 3; + } + spin_unlock_irq(&mdev->ee_lock); + got_sig = wait_event_interruptible(mdev->ee_wait, + test_and_set_bit(PROCESS_EE_RUNNING,&mdev->flags) == 0); + spin_lock_irq(&mdev->ee_lock); + if(got_sig) return 2; + } + + while(!list_empty(head)) { + le = head->next; + list_del(le); + spin_unlock_irq(&mdev->ee_lock); + e = list_entry(le, struct Tl_epoch_entry, w.list); + ok = ok && e->w.cb(mdev,&e->w,0); + spin_lock_irq(&mdev->ee_lock); + drbd_put_ee(mdev,e); + } + + clear_bit(PROCESS_EE_RUNNING,&mdev->flags); + wake_up(&mdev->ee_wait); + + return ok; +} + +STATIC int drbd_process_ee(drbd_dev *mdev, int be_sleepy) +{ + int rv; + spin_lock_irq(&mdev->ee_lock); + rv=_drbd_process_ee(mdev,be_sleepy); + spin_unlock_irq(&mdev->ee_lock); + return rv; +} + +STATIC void drbd_clear_done_ee(drbd_dev *mdev) +{ + struct list_head *le; + struct Tl_epoch_entry *e; + int n = 0; + + spin_lock_irq(&mdev->ee_lock); + + reclaim_net_ee(mdev); + + while(!list_empty(&mdev->done_ee)) { + le = mdev->done_ee.next; + list_del(le); + e = list_entry(le, struct Tl_epoch_entry, w.list); + if(mdev->conf.wire_protocol == DRBD_PROT_C || + is_syncer_blk(mdev,e->block_id)) { + ++n; + } + drbd_put_ee(mdev,e); + } + + spin_unlock_irq(&mdev->ee_lock); + + sub_unacked(mdev, n); +} + + +static inline int _wait_ee_cond(struct Drbd_Conf* mdev,struct list_head *head) +{ + int rv; + spin_lock_irq(&mdev->ee_lock); + rv = list_empty(head); + spin_unlock_irq(&mdev->ee_lock); + if(!rv) drbd_kick_lo(mdev); + return rv; +} + +void drbd_wait_ee(drbd_dev *mdev,struct list_head *head) +{ + wait_event(mdev->ee_wait,_wait_ee_cond(mdev,head)); +} + +STATIC struct socket* drbd_accept(drbd_dev *mdev,struct socket* sock) +{ + struct socket *newsock; + int err = 0; + + err = sock->ops->listen(sock, 5); + if (err) + goto out; + + if (sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &newsock)) + goto out; + + newsock->type = sock->type; + newsock->ops = sock->ops; + + err = newsock->ops->accept(sock, newsock, 0); + if (err < 0) + goto out_release; + + return newsock; + + out_release: + sock_release(newsock); + out: + if(err != -EAGAIN && err != -EINTR) + ERR("accept failed! 
%d\n", err); + return 0; +} + +STATIC int drbd_recv_short(drbd_dev *mdev, void *buf, size_t size) +{ + mm_segment_t oldfs; + struct iovec iov; + struct msghdr msg; + int rv; + + if (unlikely(drbd_did_panic == DRBD_MAGIC)) { + drbd_suicide(); + } + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iovlen = 1; + msg.msg_iov = &iov; + iov.iov_len = size; + iov.iov_base = buf; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_flags = MSG_WAITALL | MSG_NOSIGNAL; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + + rv = sock_recvmsg(mdev->meta.socket, &msg, size, msg.msg_flags); + + set_fs(oldfs); + + return rv; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +# define SK_(x) x +#else +# define SK_(x) sk_ ## x +#endif + +int drbd_recv(drbd_dev *mdev,void *buf, size_t size) +{ + mm_segment_t oldfs; + struct iovec iov; + struct msghdr msg; + int rv; + + if (unlikely(drbd_did_panic == DRBD_MAGIC)) { + drbd_suicide(); + } + + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iovlen = 1; + msg.msg_iov = &iov; + iov.iov_len = size; + iov.iov_base = buf; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_flags = MSG_WAITALL | MSG_NOSIGNAL; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + + for(;;) { + rv = sock_recvmsg(mdev->data.socket,&msg,size,msg.msg_flags); + if (rv == size) break; + + /* Note: + * ECONNRESET other side closed the connection + * ERESTARTSYS (on sock) we got a signal + */ + + if (rv < 0) { + if (rv == -ECONNRESET) + INFO("sock was reset by peer\n"); + else if (rv != -ERESTARTSYS) + ERR("sock_recvmsg returned %d\n",rv); + break; + } else if (rv == 0) { + INFO("sock was shut down by peer\n"); + break; + } else { + /* signal came in, or peer/link went down, + * after we read a partial message + */ + // D_ASSERT(signal_pending(current)); + break; + } + }; + + set_fs(oldfs); + + if(rv != size) { + set_cstate(mdev,BrokenPipe); + drbd_thread_restart_nowait(&mdev->receiver); + } + + return rv; +} + +STATIC struct socket *drbd_try_connect(drbd_dev *mdev) +{ + int err; + struct socket *sock; + struct sockaddr_in src_in; + + err = sock_create(AF_INET, SOCK_STREAM, 0, &sock); + if (err) { + ERR("sock_creat(..)=%d\n", err); + return NULL; + } + + sock->sk->SK_(rcvtimeo) = + sock->sk->SK_(sndtimeo) = mdev->conf.try_connect_int*HZ; + + /* explicitly bind to the configured IP as source IP + for the outgoing connections. + This is needed for multihomed hosts and to be + able to use lo: interfaces for drbd. + Make sure to use 0 as portnumber, so linux selects + a free one dynamically. 
+ */ + memcpy (&src_in, &(mdev->conf.my_addr), sizeof(struct sockaddr_in)); + src_in.sin_port = 0; + + err = sock->ops->bind(sock, + (struct sockaddr * ) &src_in, + sizeof (struct sockaddr_in)); + if (err) { + ERR("Unable to bind source sock (%d)\n", err); + sock_release(sock); + sock = NULL; + return sock; + } + + err = sock->ops->connect(sock, + (struct sockaddr *) mdev->conf.other_addr, + mdev->conf.other_addr_len, 0); + + if (err) { + sock_release(sock); + sock = NULL; + } + return sock; +} + +STATIC struct socket *drbd_wait_for_connect(drbd_dev *mdev) +{ + int err; + struct socket *sock,*sock2; + + err = sock_create(AF_INET, SOCK_STREAM, 0, &sock2); + if (err) { + ERR("sock_creat(..)=%d\n", err); + return NULL; + } + + sock2->sk->SK_(reuse) = 1; /* SO_REUSEADDR */ + sock2->sk->SK_(rcvtimeo) = + sock2->sk->SK_(sndtimeo) = mdev->conf.try_connect_int*HZ; + + err = sock2->ops->bind(sock2, + (struct sockaddr *) mdev->conf.my_addr, + mdev->conf.my_addr_len); + if (err) { + ERR("Unable to bind sock2 (%d)\n", err); + sock_release(sock2); + set_cstate(mdev,Unconnected); + return 0; + } + + sock = drbd_accept(mdev,sock2); + sock_release(sock2); + + return sock; +} + +STATIC int drbd_do_handshake(drbd_dev *mdev); + +/* + * return values: + * 1 yess, we have a valid connection + * 0 oops, did not work out, please try again + * -1 peer talks different language, + * no point in trying again, please go standalone. + */ +int drbd_connect(drbd_dev *mdev) +{ + struct socket *sock,*msock; + int h; + + D_ASSERT(mdev->cstate!=Unconfigured); + D_ASSERT(!mdev->data.socket); + + set_cstate(mdev,WFConnection); + + /* Break out of unknown connect loops by random wait here. */ + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(net_random() % ((mdev->conf.try_connect_int*HZ)/4)); + + while(1) { + sock=drbd_try_connect(mdev); + if(sock) { + msock=drbd_wait_for_connect(mdev); + if(msock) break; + else sock_release(sock); + } else { + sock=drbd_wait_for_connect(mdev); + if(sock) { + int retry; + for (retry=1; retry <= 10; retry++) { + // give the other side time to call + // bind() & listen() + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ / 10); + msock=drbd_try_connect(mdev); + if(msock) goto connected; + ERR("msock try_connect %d\n",retry); + } + sock_release(sock); + } + } + if(mdev->cstate==Unconnected) return -1; + if(signal_pending(current)) { + drbd_flush_signals(current); + smp_rmb(); + if (get_t_state(&mdev->receiver) == Exiting) + return -1; + } + } + + connected: + + msock->sk->SK_(reuse)=1; /* SO_REUSEADDR */ + sock->sk->SK_(reuse)=1; /* SO_REUSEADDR */ + + sock->sk->SK_(allocation) = GFP_NOIO; + msock->sk->SK_(allocation) = GFP_NOIO; + + sock->sk->SK_(priority)=TC_PRIO_BULK; + NOT_IN_26(sock->sk->tp_pinfo.af_tcp.nonagle=0;) + ONLY_IN_26( tcp_sk(sock->sk)->nonagle = 0;) + // FIXME fold to limits. should be done in drbd_ioctl + sock->sk->SK_(sndbuf) = mdev->conf.sndbuf_size; + sock->sk->SK_(rcvbuf) = mdev->conf.sndbuf_size; + /* NOT YET ... 
+ * sock->sk->SK_(sndtimeo) = mdev->conf.timeout*HZ/20; + * sock->sk->SK_(rcvtimeo) = MAX_SCHEDULE_TIMEOUT; + * THINK HandShake timeout, hardcoded for now: */ + sock->sk->SK_(sndtimeo) = + sock->sk->SK_(rcvtimeo) = 2*HZ; + sock->sk->SK_(userlocks) |= SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK; + + msock->sk->SK_(priority)=TC_PRIO_INTERACTIVE; + NOT_IN_26(sock->sk->tp_pinfo.af_tcp.nonagle=1;) + ONLY_IN_26(tcp_sk(sock->sk)->nonagle = 1;) + msock->sk->SK_(sndbuf) = 2*32767; + msock->sk->SK_(sndtimeo) = mdev->conf.timeout*HZ/20; + msock->sk->SK_(rcvtimeo) = mdev->conf.ping_int*HZ; + + mdev->data.socket = sock; + mdev->meta.socket = msock; + mdev->last_received = jiffies; + + set_cstate(mdev,WFReportParams); + D_ASSERT(mdev->asender.task == NULL); + + h = drbd_do_handshake(mdev); + if (h <= 0) return h; + + clear_bit(ON_PRI_INC_HUMAN,&mdev->flags); + clear_bit(ON_PRI_INC_TIMEOUTEX,&mdev->flags); + + sock->sk->SK_(sndtimeo) = mdev->conf.timeout*HZ/20; + sock->sk->SK_(rcvtimeo) = MAX_SCHEDULE_TIMEOUT; + + drbd_thread_start(&mdev->asender); + + drbd_send_param(mdev,0); + clear_bit(USE_DEGR_WFC_T,&mdev->flags); + + return 1; +} + +STATIC int drbd_recv_header(drbd_dev *mdev, Drbd_Header *h) +{ + int r; + + r = drbd_recv(mdev,h,sizeof(*h)); + + if (unlikely( r != sizeof(*h) )) { + ERR("short read expecting header on sock: r=%d\n",r); + return FALSE; + }; + h->command = be16_to_cpu(h->command); + h->length = be16_to_cpu(h->length); + if (unlikely( h->magic != BE_DRBD_MAGIC )) { + ERR("magic?? m: 0x%lx c: %d l: %d\n", + (long)be32_to_cpu(h->magic), + h->command, h->length); + return FALSE; + } + mdev->last_received = jiffies; + + return TRUE; +} + +STATIC int receive_Barrier(drbd_dev *mdev, Drbd_Header* h) +{ + int rv; + int epoch_size; + Drbd_Barrier_Packet *p = (Drbd_Barrier_Packet*)h; + + ERR_IF(mdev->state != Secondary) return FALSE; + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + + rv = drbd_recv(mdev, h->payload, h->length); + ERR_IF(rv != h->length) return FALSE; + + inc_unacked(mdev); + + // DBG("got Barrier\n"); + + if (mdev->conf.wire_protocol != DRBD_PROT_C) + drbd_kick_lo(mdev); + + drbd_wait_ee(mdev,&mdev->active_ee); + + spin_lock_irq(&mdev->ee_lock); + rv = _drbd_process_ee(mdev,1); + + epoch_size=atomic_read(&mdev->epoch_size); + atomic_set(&mdev->epoch_size,0); + spin_unlock_irq(&mdev->ee_lock); + + rv &= drbd_send_b_ack(mdev, p->barrier, epoch_size); + dec_unacked(mdev); + + return rv; +} + +STATIC struct Tl_epoch_entry * +read_in_block(drbd_dev *mdev, int data_size) +{ + struct Tl_epoch_entry *e; + drbd_bio_t *bio; + int rr; + + spin_lock_irq(&mdev->ee_lock); + e=drbd_get_ee(mdev); + spin_unlock_irq(&mdev->ee_lock); + if(!e) return 0; + + bio = &e->private_bio; + + rr=drbd_recv(mdev, drbd_bio_kmap(bio), data_size); + drbd_bio_kunmap(bio); + + if ( rr != data_size) { + NOT_IN_26(clear_bit(BH_Lock, &bio->b_state);) + spin_lock_irq(&mdev->ee_lock); + drbd_put_ee(mdev,e); + spin_unlock_irq(&mdev->ee_lock); + WARN("short read receiving data block: read %d expected %d\n", + rr, data_size); + return 0; + } + mdev->recv_cnt+=data_size>>9; + + return e; +} + +STATIC void receive_data_tail(drbd_dev *mdev,int data_size) +{ + /* kick lower level device, if we have more than (arbitrary number) + * reference counts on it, which typically are locally submitted io + * requests. don't use unacked_cnt, so we speed up proto A and B, too. + * + * XXX maybe: make that arbitrary number configurable. + * for now, I choose 1/16 of max-epoch-size. 
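+	 * (For example, with a max-epoch-size of 2048 the lower level device
+	 * gets kicked once at least 2048>>4 = 128 local references are in
+	 * flight.)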
+ */ + if (atomic_read(&mdev->local_cnt) >= (mdev->conf.max_epoch_size>>4) ) { + drbd_kick_lo(mdev); + } + mdev->writ_cnt+=data_size>>9; +} + +STATIC int recv_dless_read(drbd_dev *mdev, drbd_request_t *req, + sector_t sector, int data_size) +{ + drbd_bio_t *bio; + int ok,rr; + + bio = req->master_bio; + + D_ASSERT( sector == drbd_req_get_sector(req) ); + + rr=drbd_recv(mdev,drbd_bio_kmap(bio),data_size); + drbd_bio_kunmap(bio); + + ok=(rr==data_size); + drbd_bio_endio(bio,ok); + dec_ap_bio(mdev); + + dec_ap_pending(mdev); + return ok; +} + +STATIC int e_end_resync_block(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w; + sector_t sector = drbd_ee_get_sector(e); + int ok; + + drbd_rs_complete_io(mdev,sector); // before set_in_sync() ! + if (likely( drbd_bio_uptodate(&e->private_bio) )) { + ok = !test_bit(DISKLESS,&mdev->flags) && + !test_bit(PARTNER_DISKLESS,&mdev->flags); + if (likely( ok )) { + drbd_set_in_sync(mdev, sector, drbd_ee_get_size(e)); + /* THINK maybe don't send ack either + * when we are suddenly diskless? + * Dropping it here should do no harm, + * since peer has no structs referencing this. + */ + } + ok = drbd_send_ack(mdev,WriteAck,e); + set_bit(SYNC_STARTED,&mdev->flags); + } else { + ok = drbd_send_ack(mdev,NegAck,e); + ok&= drbd_io_error(mdev); + } + dec_unacked(mdev); + + return ok; +} + +STATIC int recv_resync_read(drbd_dev *mdev,sector_t sector, int data_size) +{ + struct Tl_epoch_entry *e; + + e = read_in_block(mdev,data_size); + if(!e) return FALSE; + + dec_rs_pending(mdev); + + e->block_id = ID_SYNCER; + if(!inc_local(mdev)) { + if (DRBD_ratelimit(5*HZ,5)) + ERR("Can not write resync data to local disk.\n"); + drbd_send_ack(mdev,NegAck,e); + spin_lock_irq(&mdev->ee_lock); + drbd_put_ee(mdev,e); + spin_unlock_irq(&mdev->ee_lock); + return TRUE; + } + + drbd_ee_prepare_write(mdev,e,sector,data_size); + e->w.cb = e_end_resync_block; + + spin_lock_irq(&mdev->ee_lock); + list_add(&e->w.list,&mdev->sync_ee); + spin_unlock_irq(&mdev->ee_lock); + + inc_unacked(mdev); + + drbd_generic_make_request(WRITE,&e->private_bio); + + receive_data_tail(mdev,data_size); + return TRUE; +} + +STATIC int receive_DataReply(drbd_dev *mdev,Drbd_Header* h) +{ + drbd_request_t *req; + sector_t sector; + unsigned int header_size,data_size; + int ok; + Drbd_Data_Packet *p = (Drbd_Data_Packet*)h; + + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + /* I expect a block to be a multiple of 512 byte, and + * no more than 4K (PAGE_SIZE). is this too restrictive? + */ + ERR_IF(data_size == 0) return FALSE; + ERR_IF(data_size & 0x1ff) return FALSE; + ERR_IF(data_size > PAGE_SIZE) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + sector = be64_to_cpu(p->sector); + + req = (drbd_request_t *)(long)p->block_id; + D_ASSERT(req->w.cb == w_is_app_read); + + spin_lock(&mdev->pr_lock); + list_del(&req->w.list); + spin_unlock(&mdev->pr_lock); + + ok = recv_dless_read(mdev,req,sector,data_size); + + INVALIDATE_MAGIC(req); + mempool_free(req,drbd_request_mempool); + + return ok; +} + +STATIC int receive_RSDataReply(drbd_dev *mdev,Drbd_Header* h) +{ + sector_t sector; + unsigned int header_size,data_size; + int ok; + Drbd_Data_Packet *p = (Drbd_Data_Packet*)h; + + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + /* I expect a block to be a multiple of 512 byte, and + * no more than 4K (PAGE_SIZE). is this too restrictive? 
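+	 * (The ERR_IF checks below enforce exactly that: nonzero, a multiple
+	 * of 512 bytes, and at most PAGE_SIZE, since every Tl_epoch_entry is
+	 * backed by a single page.)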
+ */ + ERR_IF(data_size == 0) return FALSE; + ERR_IF(data_size & 0x1ff) return FALSE; + ERR_IF(data_size > PAGE_SIZE) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + sector = be64_to_cpu(p->sector); + D_ASSERT(p->block_id == ID_SYNCER); + + ok = recv_resync_read(mdev,sector,data_size); + + return ok; +} + +STATIC int e_end_block(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w; + sector_t sector = drbd_ee_get_sector(e); + int ok=1; + + atomic_inc(&mdev->epoch_size); + if(mdev->conf.wire_protocol == DRBD_PROT_C) { + if(likely(drbd_bio_uptodate(&e->private_bio))) { + ok=drbd_send_ack(mdev,WriteAck,e); + if (ok && test_bit(SYNC_STARTED,&mdev->flags) ) + drbd_set_in_sync(mdev,sector,drbd_ee_get_size(e)); + } else { + ok = drbd_send_ack(mdev,NegAck,e); + ok&= drbd_io_error(mdev); + /* we expect it to be marked out of sync anyways... + * maybe assert this? + */ + } + dec_unacked(mdev); + + return ok; + } + + if(unlikely(!drbd_bio_uptodate(&e->private_bio))) { + ok = drbd_io_error(mdev); + } + + return ok; +} + +// mirrored write +STATIC int receive_Data(drbd_dev *mdev,Drbd_Header* h) +{ + sector_t sector; + struct Tl_epoch_entry *e; + Drbd_Data_Packet *p = (Drbd_Data_Packet*)h; + int header_size,data_size; + + // FIXME merge this code dups into some helper function + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + /* I expect a block to be a multiple of 512 byte, and + * no more than 4K (PAGE_SIZE). is this too restrictive? + */ + ERR_IF(data_size == 0) return FALSE; + ERR_IF(data_size & 0x1ff) return FALSE; + ERR_IF(data_size > PAGE_SIZE) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + sector = be64_to_cpu(p->sector); + + e = read_in_block(mdev,data_size); + if (!e) return FALSE; + e->block_id = p->block_id; // no meaning on this side, e* on partner + + if(!inc_local(mdev)) { + if (DRBD_ratelimit(5*HZ,5)) + ERR("Can not write mirrored data block to local disk.\n"); + drbd_send_ack(mdev,NegAck,e); + spin_lock_irq(&mdev->ee_lock); + drbd_put_ee(mdev,e); + spin_unlock_irq(&mdev->ee_lock); + return TRUE; + } + + drbd_ee_prepare_write(mdev, e, sector, data_size); + e->w.cb = e_end_block; + + spin_lock_irq(&mdev->ee_lock); + list_add(&e->w.list,&mdev->active_ee); + spin_unlock_irq(&mdev->ee_lock); + + switch(mdev->conf.wire_protocol) { + case DRBD_PROT_C: + inc_unacked(mdev); + break; + case DRBD_PROT_B: + drbd_send_ack(mdev, RecvAck, e); + break; + case DRBD_PROT_A: + // nothing to do + break; + } + + drbd_generic_make_request(WRITE,&e->private_bio); + + receive_data_tail(mdev,data_size); + return TRUE; +} + +STATIC int receive_DataRequest(drbd_dev *mdev,Drbd_Header *h) +{ + sector_t sector; + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); + struct Tl_epoch_entry *e; + int size; + Drbd_BlockRequest_Packet *p = (Drbd_BlockRequest_Packet*)h; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + /* + * handled by NegDReply below ... 
+ ERR_IF (test_bit(DISKLESS,&mdev->flags)) { + return FALSE; + ERR_IF ( (mdev->gen_cnt[Flags] & MDF_Consistent) == 0 ) + return FALSE; + */ + + if (size <= 0 || (size & 0x1ff) != 0 || size > PAGE_SIZE) { + ERR("%s:%d: sector: %lu, size: %d\n", __FILE__, __LINE__, + (unsigned long)sector,size); + return FALSE; + } + if ( sector + (size>>9) > capacity) { + ERR("%s:%d: sector: %lu, size: %d\n", __FILE__, __LINE__, + (unsigned long)sector,size); + return FALSE; + } + + spin_lock_irq(&mdev->ee_lock); + e=drbd_get_ee(mdev); + if(!e) { + spin_unlock_irq(&mdev->ee_lock); + return FALSE; + } + e->block_id = p->block_id; // no meaning on this side, pr* on partner + list_add(&e->w.list,&mdev->read_ee); + spin_unlock_irq(&mdev->ee_lock); + + if(!inc_local(mdev) || (mdev->gen_cnt[Flags] & MDF_Consistent) == 0) { + if (DRBD_ratelimit(5*HZ,5)) + ERR("Can not satisfy peer's read request, no local data.\n"); + drbd_send_ack(mdev,NegDReply,e); + spin_lock_irq(&mdev->ee_lock); + drbd_put_ee(mdev,e); + spin_unlock_irq(&mdev->ee_lock); + return TRUE; + } + + drbd_ee_prepare_read(mdev,e,sector,size); + + switch (h->command) { + case DataRequest: + e->w.cb = w_e_end_data_req; + break; + case RSDataRequest: + e->w.cb = w_e_end_rsdata_req; + /* Eventually this should become asynchrously. Currently it + * blocks the whole receiver just to delay the reading of a + * resync data block. + * the drbd_work_queue mechanism is made for this... + */ + if (!drbd_rs_begin_io(mdev,sector)) { + // we have been interrupted, probably connection lost! + D_ASSERT(signal_pending(current)); + drbd_put_ee(mdev,e); + return 0; + } + break; + default: + ERR("unexpected command (%s) in receive_DataRequest\n", + cmdname(h->command)); + } + + mdev->read_cnt += size >> 9; + inc_unacked(mdev); + drbd_generic_make_request(READ,&e->private_bio); + if (atomic_read(&mdev->local_cnt) >= (mdev->conf.max_epoch_size>>4) ) { + drbd_kick_lo(mdev); + } + + + return TRUE; +} + +STATIC int receive_SyncParam(drbd_dev *mdev,Drbd_Header *h) +{ + int ok = TRUE; + Drbd_SyncParam_Packet *p = (Drbd_SyncParam_Packet*)h; + + // FIXME move into helper + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + // XXX harmless race with ioctl ... + mdev->sync_conf.rate = be32_to_cpu(p->rate); + mdev->sync_conf.use_csums = be32_to_cpu(p->use_csums); + mdev->sync_conf.skip = be32_to_cpu(p->skip); + drbd_alter_sg(mdev, be32_to_cpu(p->group)); + + if ( (mdev->cstate == SkippedSyncS || mdev->cstate == SkippedSyncT) + && !mdev->sync_conf.skip ) + { + set_cstate(mdev,WFReportParams); + ok = drbd_send_param(mdev,0); + } + + return ok; +} + +STATIC int drbd_sync_handshake(drbd_dev *mdev, Drbd_Parameter_Packet *p) +{ + int have_good,sync; + + have_good = drbd_md_compare(mdev,p); + + if(have_good==0) { + if (drbd_md_test_flag(mdev,MDF_PrimaryInd)) { + /* gen counts compare the same, but I have the + * PrimaryIndicator set. so the peer has, too + * (otherwise this would not compare the same). + * so we had a split brain! + * + * FIXME maybe log MDF_SplitBran into metadata, + * and refuse to do anything until told otherwise! + * + * for now: just go StandAlone. 
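+	 * (Split brain here means the generation counts compare equal while
+	 * both sides carry the PrimaryIndicator flag, i.e. both nodes were
+	 * Primary at some point without seeing each other.)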
+ */ + ALERT("Split-Brain detected, dropping connection!\n"); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + sync=0; + } else { + sync=1; + } + + drbd_dump_md(mdev,p,0); + // INFO("have_good=%d sync=%d\n", have_good, sync); + + if (have_good > 0 && !drbd_md_test_flag(mdev,MDF_Consistent)) { + /* doh. I cannot become SyncSource when I am inconsistent! + */ + ERR("I shall become SyncSource, but I am inconsistent!\n"); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + if (have_good < 0 && + !(be32_to_cpu(p->gen_cnt[Flags]) & MDF_Consistent) ) { + /* doh. Peer cannot become SyncSource when inconsistent + */ + ERR("I shall become SyncTarget, but Peer is inconsistent!\n"); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + + if ( mdev->sync_conf.skip && sync ) { + if (have_good == 1) + set_cstate(mdev,SkippedSyncS); + else // have_good == -1 + set_cstate(mdev,SkippedSyncT); + return TRUE; + } + + if( sync ) { + if(have_good == 1) { + D_ASSERT(drbd_md_test_flag(mdev,MDF_Consistent)); + set_cstate(mdev,WFBitMapS); + wait_event(mdev->cstate_wait, + atomic_read(&mdev->ap_bio_cnt)==0); + drbd_bm_lock(mdev); // { + drbd_send_bitmap(mdev); + drbd_bm_unlock(mdev); // } + } else { // have_good == -1 + if ( (mdev->state == Primary) && + drbd_md_test_flag(mdev,MDF_Consistent) ) { + /* FIXME + * allow Primary become SyncTarget if it was + * diskless, and now had a storage reattached. + * only somewhere the MDF_Consistent flag is + * set where it should not... I think. + */ + ERR("Current Primary shall become sync TARGET!" + " Aborting to prevent data corruption.\n"); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + drbd_md_clear_flag(mdev,MDF_Consistent); + set_cstate(mdev,WFBitMapT); + } + } else { + set_cstate(mdev,Connected); + drbd_bm_lock(mdev); // { + if(drbd_bm_total_weight(mdev)) { + if (drbd_md_test_flag(mdev,MDF_Consistent)) { + /* We are not going to do a resync but there + are marks in the bitmap. + (Could be from the AL, or someone used + the write_gc.pl program) + Clean the bitmap... + */ + INFO("No resync -> clearing bit map.\n"); + drbd_bm_clear_all(mdev); + drbd_bm_write(mdev); + } else { + WARN("I am inconsistent, but there is no sync? BOTH nodes inconsistent!\n"); + } + } + drbd_bm_unlock(mdev); // } + } + + if (have_good == -1) { + /* Sync-Target has to adopt source's gen_cnt. */ + int i; + for(i=HumanCnt;igen_cnt[i]=be32_to_cpu(p->gen_cnt[i]); + } + } + return TRUE; +} + +STATIC int receive_param(drbd_dev *mdev, Drbd_Header *h) +{ + Drbd_Parameter_Packet *p = (Drbd_Parameter_Packet*)h; + int consider_sync; + int oo_state,i; + sector_t p_size, p_usize, my_usize; + + if (h->length != (sizeof(*p)-sizeof(*h))) { + ERR("Incompatible packet size of Parameter packet!\n"); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + if (p->magic != BE_DRBD_MAGIC) { + ERR("invalid Parameter_Packet magic! Protocol version: me %d, peer %d\n", + PRO_VERSION, be32_to_cpu(p->version)); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + + if(be32_to_cpu(p->version)!=PRO_VERSION) { + ERR("incompatible releases! 
Protocol version: me %d, peer %d\n", + PRO_VERSION, be32_to_cpu(p->version)); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + + oo_state = be32_to_cpu(p->state); + if (oo_state != Primary && oo_state != Secondary) { + ERR("unexpected peer state: 0x%x\n", oo_state); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + + if(be32_to_cpu(p->state) == Primary && mdev->state == Primary ) { + ERR("incompatible states (both Primary!)\n"); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + + if(be32_to_cpu(p->protocol)!=mdev->conf.wire_protocol) { + int peer_proto = be32_to_cpu(p->protocol); + if (DRBD_PROT_A <= peer_proto && peer_proto <= DRBD_PROT_C) { + ERR("incompatible communication protocols: " + "me %c, peer %c\n", + 'A'-1+mdev->conf.wire_protocol, + 'A'-1+peer_proto); + } else { + ERR("incompatible communication protocols: " + "me %c, peer [%d]\n", + 'A'-1+mdev->conf.wire_protocol, + peer_proto); + } + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + + p_size=be64_to_cpu(p->p_size); + + if(p_size == 0 && test_bit(DISKLESS,&mdev->flags)) { + /* FIXME maybe allow connection, + * but refuse to become primary? */ + ERR("some backing storage is needed\n"); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + return FALSE; + } + + drbd_bm_lock(mdev); // { + mdev->p_size=p_size; + + set_bit(MD_DIRTY,&mdev->flags); // we are changing state! + + p_usize=be64_to_cpu(p->u_size); + /* + * you may get a flip-flop connection established/connection loss, in + * case both really have different usize uppon first connect! + * try to solve it thus: + ***/ +#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) + if (mdev->cstate == WFReportParams) { + /* this is first connect, or an otherwise expected param + * exchange. choose the minimum */ + p_usize = min_not_zero(mdev->lo_usize, p_usize); + } else { + /* this was an "unexpected" param packet, + * just do what the peer suggests */ + } +#undef min_not_zero + + my_usize = mdev->lo_usize; + + if( mdev->lo_usize > p_usize ) { + mdev->lo_usize = p_usize; + INFO("Peer sets u_size to %lu KB\n", + (unsigned long)mdev->lo_usize); + } + + if( drbd_new_dev_size(mdev) < + (drbd_get_capacity(mdev->this_bdev)>>1) && + mdev->gen_cnt[Flags] & MDF_Consistent ) { + ERR("The peer's disk size is too small!\n"); + set_cstate(mdev,StandAlone); + drbd_thread_stop_nowait(&mdev->receiver); + mdev->lo_usize = my_usize; + return FALSE; + } + + consider_sync = (mdev->cstate == WFReportParams); + drbd_determin_dev_size(mdev); + if(drbd_disk_less_node_present(mdev)) consider_sync=0; + if(test_bit(DISKLESS, &mdev->flags)) consider_sync=0; + + drbd_bm_unlock(mdev); // } + + if(be32_to_cpu(p->flags)&1) { + consider_sync=1; + drbd_send_param(mdev,2); + } + if(be32_to_cpu(p->flags)&2) consider_sync=1; + + // XXX harmless race with ioctl ... + mdev->sync_conf.rate = + max_t(int,mdev->sync_conf.rate, be32_to_cpu(p->sync_rate)); + + // if one of them wants to skip, both of them should skip. + mdev->sync_conf.skip = + mdev->sync_conf.skip != 0 || p->skip_sync != 0; + mdev->sync_conf.group = + min_t(int,mdev->sync_conf.group,be32_to_cpu(p->sync_group)); + + if(!p_size) { + /* no point in trying to sync a diskless peer: */ + consider_sync = 0; + if (!test_and_set_bit(PARTNER_DISKLESS, &mdev->flags)) { + /* if we got here, we *do* have a disk. + * but it may be inconsistent... 
+ * anyways, record that next time we need a full sync. + */ + clear_bit(PARTNER_CONSISTENT, &mdev->flags); + drbd_md_set_flag(mdev,MDF_FullSync); + drbd_md_write(mdev); + /* actually we'd need to bm_fill_bm(,-1); drbd_write_bm(mdev); + * but this is not necessary _now_. + * we have the MDF_FullSync bit on disk. + * on the next _drbd_send_bitmap this will be done. + */ + WARN("PARTNER DISKLESS\n"); + mdev->rs_total = 0; + } + if(mdev->cstate >= Connected ) { + if(mdev->state == Primary) tl_clear(mdev); + if(mdev->state == Primary || + be32_to_cpu(p->state) == Primary ) { + drbd_md_inc(mdev,ConnectedCnt); + } + } + if(mdev->cstate > Connected ) { + WARN("Resync aborted.\n"); + set_cstate(mdev,Connected); + } + } else { + if (test_and_clear_bit(PARTNER_DISKLESS, &mdev->flags)) { + WARN("Partner no longer diskless\n"); + D_ASSERT(consider_sync); + } + } + + if (be32_to_cpu(p->gen_cnt[Flags]) & MDF_Consistent) { + set_bit(PARTNER_CONSISTENT, &mdev->flags); + } else { + clear_bit(PARTNER_CONSISTENT, &mdev->flags); + } + + if (mdev->cstate == WFReportParams) { + INFO("Connection established.\n"); + } + + if (consider_sync) { + if (!drbd_sync_handshake(mdev,p)) return FALSE; + } + + if (mdev->cstate == WFReportParams) set_cstate(mdev,Connected); + + oo_state = mdev->o_state; + mdev->o_state = be32_to_cpu(p->state); + if(oo_state == Secondary && mdev->o_state == Primary) { + /* Secondary has to adopt primary's gen_cnt. */ + for(i=HumanCnt;igen_cnt[i]=be32_to_cpu(p->gen_cnt[i]); + } + } + + if (oo_state != mdev->o_state) { + INFO( "%s/%s --> %s/%s\n", + nodestate_to_name(mdev->state), + nodestate_to_name(oo_state), + nodestate_to_name(mdev->state), + nodestate_to_name(mdev->o_state) ); + /* FIXME assertion for (gencounts do not diverge) */ + } + drbd_md_write(mdev); // update connected indicator, la_size, ... + + return TRUE; +} + +/* Since we are processing the bitfild from lower addresses to higher, + it does not matter if the process it in 32 bit chunks or 64 bit + chunks as long as it is little endian. (Understand it as byte stream, + beginning with the lowest byte...) If we would use big endian + we would need to process it from the highest address to the lowest, + in order to be agnostic to the 32 vs 64 bits issue. + + returns 0 on failure, 1 if we suceessfully received it. */ +STATIC int receive_bitmap(drbd_dev *mdev, Drbd_Header *h) +{ + size_t bm_words, bm_i, want, num_words; + unsigned long *buffer; + int ok=FALSE; + + drbd_bm_lock(mdev); // { + + bm_words = drbd_bm_words(mdev); + bm_i = 0; + buffer = vmalloc(BM_PACKET_WORDS*sizeof(long)); + + while (1) { + num_words = min_t(size_t, BM_PACKET_WORDS, bm_words-bm_i ); + want = num_words * sizeof(long); + ERR_IF(want != h->length) goto out; + if (want==0) break; + if (drbd_recv(mdev, buffer, want) != want) + goto out; + + drbd_bm_merge_lel(mdev, bm_i, num_words, buffer); + bm_i += num_words; + + if (!drbd_recv_header(mdev,h)) + goto out; + D_ASSERT(h->command == ReportBitMap); + } + + if (mdev->cstate == WFBitMapS) { + drbd_start_resync(mdev,SyncSource); + } else if (mdev->cstate == WFBitMapT) { + ok = drbd_send_bitmap(mdev); + if (!ok) goto out; + drbd_start_resync(mdev,SyncTarget); // XXX cannot fail ??? + } else { + ERR("unexpected cstate (%s) in receive_bitmap\n", + cstate_to_name(mdev->cstate)); + } + + // We just started resync. Now we can be sure that local disk IO is okay. + + /* no, actually we can't. failures happen asynchronously, anytime. + * we can never be sure. disk may have failed while we where busy shaking hands... 
+ */ +/* + * FIXME this should only be D_ASSERT here. + * *doing* it here masks a logic bug elsewhere, I think. + */ + D_ASSERT(!test_bit(PARTNER_DISKLESS,&mdev->flags)); + D_ASSERT(!test_bit(DISKLESS,&mdev->flags)); +// EXPLAIN: + clear_bit(MD_IO_ALLOWED,&mdev->flags); + + ok=TRUE; + out: + drbd_bm_unlock(mdev); // } + vfree(buffer); + return ok; +} + +STATIC void drbd_fail_pending_reads(drbd_dev *mdev) +{ + struct list_head *le; + drbd_bio_t *bio; + LIST_HEAD(workset); + + /* + * Application READ requests + */ + spin_lock(&mdev->pr_lock); + list_splice_init(&mdev->app_reads,&workset); + spin_unlock(&mdev->pr_lock); + + while(!list_empty(&workset)) { + drbd_request_t *req; + le = workset.next; + req = list_entry(le, drbd_request_t, w.list); + list_del(le); + + bio = req->master_bio; + + drbd_bio_IO_error(bio); + dec_ap_bio(mdev); + dec_ap_pending(mdev); + + INVALIDATE_MAGIC(req); + mempool_free(req,drbd_request_mempool); + } +} + +STATIC int receive_skip(drbd_dev *mdev,Drbd_Header *h) +{ + // TODO zero copy sink :) + static char sink[128]; + int size,want,r; + + WARN("skipping unknown optional packet type %d, l: %d!\n", + h->command, h->length ); + + size = h->length; + while (size > 0) { + want = min_t(int,size,sizeof(sink)); + r = drbd_recv(mdev,sink,want); + ERR_IF(r < 0) break; + size -= r; + } + return (size == 0); +} + +STATIC int receive_BecomeSyncTarget(drbd_dev *mdev, Drbd_Header *h) +{ + ERR_IF(!mdev->bitmap) return FALSE; + ERR_IF(mdev->state != Secondary) + return FALSE; + ERR_IF(mdev->cstate != Connected) + return FALSE; + ERR_IF(test_bit(DISKLESS,&mdev->flags)) + return FALSE; + + drbd_bm_lock(mdev); + drbd_bm_set_all(mdev); + drbd_bm_write(mdev); + drbd_start_resync(mdev,SyncTarget); + drbd_bm_unlock(mdev); + return TRUE; +} + +STATIC int receive_BecomeSyncSource(drbd_dev *mdev, Drbd_Header *h) +{ + ERR_IF(mdev->cstate != Connected) + return FALSE; + ERR_IF(test_bit(DISKLESS,&mdev->flags)) + return FALSE; + ERR_IF(!drbd_md_test_flag(mdev,MDF_Consistent)) + return FALSE; + + drbd_bm_lock(mdev); + drbd_bm_set_all(mdev); + drbd_bm_write(mdev); + drbd_start_resync(mdev,SyncSource); + drbd_bm_unlock(mdev); + return TRUE; +} + +STATIC int receive_UnplugRemote(drbd_dev *mdev, Drbd_Header *h) +{ + if (!test_bit(DISKLESS,&mdev->flags)) drbd_kick_lo(mdev); + return TRUE; // cannot fail. 
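+	/* Kicking the lower level device here flushes requests we have
+	 * already submitted locally on the peer's behalf; a diskless node
+	 * has nothing to kick. */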
+} + +typedef int (*drbd_cmd_handler_f)(drbd_dev*,Drbd_Header*); + +static drbd_cmd_handler_f drbd_default_handler[] = { + [Data] = receive_Data, + [DataReply] = receive_DataReply, + [RSDataReply] = receive_RSDataReply, + [RecvAck] = NULL, //receive_RecvAck, + [WriteAck] = NULL, //receive_WriteAck, + [Barrier] = receive_Barrier, + [BarrierAck] = NULL, //receive_BarrierAck, + [ReportParams] = receive_param, + [ReportBitMap] = receive_bitmap, + [Ping] = NULL, //receive_Ping, + [PingAck] = NULL, //receive_PingAck, + [BecomeSyncTarget] = receive_BecomeSyncTarget, + [BecomeSyncSource] = receive_BecomeSyncSource, + [UnplugRemote] = receive_UnplugRemote, + [DataRequest] = receive_DataRequest, + [RSDataRequest] = receive_DataRequest, //receive_RSDataRequest, + [SyncParam] = receive_SyncParam, +}; + +static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler; +static drbd_cmd_handler_f *drbd_opt_cmd_handler = NULL; + +STATIC void drbdd(drbd_dev *mdev) +{ + drbd_cmd_handler_f handler; + Drbd_Header *header = &mdev->data.rbuf.head; + + for (;;) { + if (!drbd_recv_header(mdev,header)) + break; + + if (header->command < MAX_CMD) + handler = drbd_cmd_handler[header->command]; + else if (MayIgnore < header->command && header->command < MAX_OPT_CMD) + handler = drbd_opt_cmd_handler[header->command-MayIgnore]; + else if (header->command > MAX_OPT_CMD) + handler = receive_skip; + else + handler = NULL; + + if (unlikely(!handler)) { + ERR("unknown packet type %d, l: %d!\n", + header->command, header->length); + break; + } + if (mdev->cstate == WFReportParams && header->command != ReportParams) { + ERR("received %s packet while WFReportParams!?\n", + cmdname(header->command)); + } + if (unlikely(!handler(mdev,header))) { + ERR("error receiving %s, l: %d!\n", + cmdname(header->command), header->length); + break; + } + dump_packet(mdev,mdev->data.socket,2,&mdev->data.rbuf, __FILE__, __LINE__); + } +} + +STATIC void drbd_disconnect(drbd_dev *mdev) +{ + D_ASSERT(mdev->cstate < Connected); + mdev->o_state = Unknown; + + /* in case we have been syncing, and then we drop the connection, + * we need to "w_resume_next_sg", which we try to achieve by + * setting the STOP_SYNC_TIMER bit, and schedulung the timer for + * immediate execution. + * unfortunately we cannot be sure that the timer already triggered. + * + * so we del_timer_sync here, and check that bit. + * if it is still set, we queue w_resume_next_sg anyways, + * just to be sure. + */ + + del_timer_sync(&mdev->resync_timer); + spin_lock_irq(&mdev->req_lock); + if (test_and_clear_bit(STOP_SYNC_TIMER,&mdev->flags)) { + mdev->resync_work.cb = w_resume_next_sg; + if (list_empty(&mdev->resync_work.list)) + _drbd_queue_work(&mdev->data.work,&mdev->resync_work); + // else: already queued, we only need to release the lock. + } else { + D_ASSERT(mdev->resync_work.cb == w_resync_inactive); + } + spin_unlock_irq(&mdev->req_lock); + + + drbd_thread_stop_nowait(&mdev->worker); + drbd_thread_stop(&mdev->asender); + + while(down_trylock(&mdev->data.mutex)) { + struct task_struct *task; + spin_lock(&mdev->send_task_lock); + if((task=mdev->send_task)) { + force_sig(DRBD_SIG, task); + spin_unlock(&mdev->send_task_lock); + down(&mdev->data.mutex); + break; + } else { + spin_unlock(&mdev->send_task_lock); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ / 10); + } + } + /* By grabbing the sock_mutex we make sure that no one + uses the socket right now. 
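+	   If some other context is stuck in a send, the loop above signals it
+	   with DRBD_SIG so that it drops data.mutex and we can free the
+	   socket safely.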
*/ + drbd_free_sock(mdev); + up(&mdev->data.mutex); + + drbd_fail_pending_reads(mdev); + drbd_thread_stop(&mdev->worker); + drbd_rs_cancel_all(mdev); + + // secondary + drbd_wait_ee(mdev,&mdev->active_ee); + drbd_wait_ee(mdev,&mdev->sync_ee); + drbd_clear_done_ee(mdev); + + // primary + tl_clear(mdev); + clear_bit(ISSUE_BARRIER,&mdev->flags); + wait_event( mdev->cstate_wait, atomic_read(&mdev->ap_pending_cnt)==0 ); + D_ASSERT(mdev->oldest_barrier->n_req == 0); + + // both + clear_bit(PARTNER_CONSISTENT, &mdev->flags); + clear_bit(PARTNER_DISKLESS,&mdev->flags); + + D_ASSERT(mdev->ee_in_use == 0); + D_ASSERT(list_empty(&mdev->read_ee)); // done by termination of worker + D_ASSERT(list_empty(&mdev->active_ee)); // done here + D_ASSERT(list_empty(&mdev->sync_ee)); // done here + D_ASSERT(list_empty(&mdev->done_ee)); // done here + + atomic_set(&mdev->epoch_size,0); + mdev->rs_total=0; + + if(atomic_read(&mdev->unacked_cnt)) { + ERR("unacked_cnt = %d\n",atomic_read(&mdev->unacked_cnt)); + atomic_set(&mdev->unacked_cnt,0); + } + + /* We do not have data structures that would allow us to + get the rs_pending_cnt down to 0 again. + * On SyncTarget we do not have any data structures describing + the pending RSDataRequest's we have sent. + * On SyncSource there is no data structure that tracks + the RSDataReply blocks that we sent to the SyncTarget. + And no, it is not the sum of the reference counts in the + resync_LRU. The resync_LRU tracks the whole operation including + the disk-IO, while the rs_pending_cnt only tracks the blocks + on the fly. */ + atomic_set(&mdev->rs_pending_cnt,0); + + if(atomic_read(&mdev->ap_pending_cnt)) { + ERR("ap_pending_cnt = %d\n",atomic_read(&mdev->ap_pending_cnt)); + atomic_set(&mdev->ap_pending_cnt,0); + } + + wake_up(&mdev->cstate_wait); + + if ( mdev->state == Primary && + ( test_bit(DISKLESS,&mdev->flags) + || !drbd_md_test_flag(mdev,MDF_Consistent) ) ) { + drbd_thread_stop_nowait(&mdev->receiver); + drbd_panic("Sorry, I have no access to good data anymore.\n"); + return; + } + + if (get_t_state(&mdev->receiver) == Exiting) { + if (test_bit(DISKLESS,&mdev->flags)) { + // Secondary + set_cstate(mdev,Unconfigured); + drbd_mdev_cleanup(mdev); + } else { + set_cstate(mdev,StandAlone); + drbd_thread_start(&mdev->worker); + } + } else { + set_cstate(mdev,Unconnected); + drbd_thread_start(&mdev->worker); + } + + if (mdev->state == Primary) { + if(!test_bit(DO_NOT_INC_CONCNT,&mdev->flags)) + drbd_md_inc(mdev,ConnectedCnt); + drbd_md_write(mdev); + } + clear_bit(DO_NOT_INC_CONCNT,&mdev->flags); + + /* it may still be set, because some unplug was on the fly */ + NOT_IN_26(mdev->flags &= ~(1<receiver ... + Drbd_HandShake_Packet *p = &mdev->data.sbuf.HandShake; + int ok; + + if (down_interruptible(&mdev->data.mutex)) { + ERR("interrupted during initial handshake\n"); + return 0; /* interrupted. not ok. */ + } + memset(p,0,sizeof(*p)); + p->protocol_version = cpu_to_be32(PRO_VERSION); + ok = _drbd_send_cmd( mdev, mdev->data.socket, HandShake, + (Drbd_Header *)p, sizeof(*p), 0 ); + up(&mdev->data.mutex); + return ok; +} + +/* + * return values: + * 1 yess, we have a valid connection + * 0 oops, did not work out, please try again + * -1 peer talks different language, + * no point in trying again, please go standalone. + */ +STATIC int drbd_do_handshake(drbd_dev *mdev) +{ + // ASSERT current == mdev->receiver ... 
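+	/* Both sides first send a HandShake packet carrying only their
+	 * protocol_version.  Below we accept an exact match, or a peer that
+	 * is exactly one protocol version ahead of us (with a warning). */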
+ Drbd_HandShake_Packet *p = &mdev->data.rbuf.HandShake; + const int expect = sizeof(Drbd_HandShake_Packet)-sizeof(Drbd_Header); + int rv; + + rv = drbd_send_handshake(mdev); + if (!rv) goto break_c_loop; + + rv = drbd_recv_header(mdev,&p->head); + if (!rv) goto break_c_loop; + + if (p->head.command == ReportParams) { + ERR("expected HandShake packet, received ReportParams...\n"); + ERR("peer probaly runs some incompatible 0.7 -preX version\n"); + return -1; + } else if (p->head.command != HandShake) { + ERR( "expected HandShake packet, received: %s (0x%04x)\n", + cmdname(p->head.command), p->head.command ); + return -1; + } + + if (p->head.length != expect) { + ERR( "expected HandShake length: %u, received: %u\n", + expect, p->head.length ); + return -1; + } + + rv = drbd_recv(mdev, &p->head.payload, expect); + + if (rv != expect) { + ERR("short read receiving handshake packet: l=%u\n", rv); + return 0; + } + + dump_packet(mdev,mdev->data.socket,2,&mdev->data.rbuf, __FILE__, __LINE__); + + p->protocol_version = be32_to_cpu(p->protocol_version); + + if ( p->protocol_version == PRO_VERSION || + p->protocol_version == (PRO_VERSION+1) ) { + if (p->protocol_version == (PRO_VERSION+1)) { + WARN( "You should upgrade me! " + "Peer wants protocol version: %u\n", + p->protocol_version ); + } + INFO( "Handshake successful: DRBD Network Protocol version %u\n", + PRO_VERSION ); + } /* else if ( p->protocol_version == (PRO_VERSION-1) ) { + // not yet; but next time :) + INFO( "Handshake successful: DRBD Protocol version %u\n", + (PRO_VERSION-1) ); + ... do some remapping of defaults and jump tables here ... + } */ else { + ERR( "incompatible DRBD dialects: " + "I support %u, peer wants %u\n", + PRO_VERSION, p->protocol_version ); + return -1; + } + + return 1; + + break_c_loop: + WARN( "My msock connect got accepted onto peer's sock!\n"); + /* In case a tcp connection set-up takes longer than + connect-int, we might get into the situation that this + node's msock gets connected to the peer's sock! + + To break out of this endless loop behaviour, we need to + wait unti the peer's msock connect tries are over. (1 Second) + + Additionally we wait connect-int/2 to hit with our next + connect try exactly in the peer's window of expectation. */ + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ + (mdev->conf.try_connect_int*HZ)/2); + + return 0; +} + +int drbdd_init(struct Drbd_thread *thi) +{ + drbd_dev *mdev = thi->mdev; + int minor = (int)(mdev-drbd_conf); + int h; + + sprintf(current->comm, "drbd%d_receiver", minor); + + /* printk(KERN_INFO DEVICE_NAME ": receiver living/m=%d\n", minor); */ + + while (TRUE) { + h = drbd_connect(mdev); + if (h <= 0) { + /* FIXME DISKLESS StandAlone + * does not make much sense... + * drbd_disconnect should set cstate properly... 
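+	 * (h == 0 is a transient connect failure: sleep a second and retry;
+	 * h < 0 means the handshake found an incompatible peer, so we drop
+	 * the network configuration and go StandAlone.)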
+ */ + drbd_disconnect(mdev); + if (h == 0) { + schedule_timeout(HZ); + continue; + } + + WARN("Discarding network configuration.\n"); + set_cstate(mdev,StandAlone); + break; + } + if (get_t_state(thi) == Exiting) break; + drbdd(mdev); + drbd_disconnect(mdev); + if (get_t_state(thi) == Exiting) break; + if(mdev->conf.on_disconnect == DropNetConf) { + set_cstate(mdev,StandAlone); + break; + } + else { + if (signal_pending(current)) { + drbd_flush_signals(current); + } + spin_lock(&thi->t_lock); + D_ASSERT(thi->t_state == Restarting); + thi->t_state = Running; + spin_unlock(&thi->t_lock); + } + } + + INFO("receiver terminated\n"); + + return 0; +} + +/* ********* acknowledge sender ******** */ + +STATIC int got_Ping(drbd_dev *mdev, Drbd_Header* h) +{ + return drbd_send_ping_ack(mdev); + +} + +STATIC int got_PingAck(drbd_dev *mdev, Drbd_Header* h) +{ + // restore idle timeout + mdev->meta.socket->sk->SK_(rcvtimeo) = mdev->conf.ping_int*HZ; + + return TRUE; +} + +STATIC int got_BlockAck(drbd_dev *mdev, Drbd_Header* h) +{ + drbd_request_t *req; + Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h; + sector_t sector = be64_to_cpu(p->sector); + int blksize = be32_to_cpu(p->blksize); + + smp_rmb(); + if(likely(!test_bit(PARTNER_DISKLESS,&mdev->flags))) { + // test_bit(PARTNER_DISKLESS,&mdev->flags) + // This happens if one a few IO requests on the peer + // failed, and some subsequest completed sucessfull + // afterwards. + + // But we killed everything out of the transferlog + // as we got the news hat IO is broken on the peer. + + if( is_syncer_blk(mdev,p->block_id)) { + drbd_set_in_sync(mdev,sector,blksize); + set_bit(SYNC_STARTED,&mdev->flags); + } else { + req=(drbd_request_t*)(long)p->block_id; + + ERR_IF (!VALID_POINTER(req)) return FALSE; + + drbd_end_req(req, RQ_DRBD_SENT, 1, sector); + + if (test_bit(SYNC_STARTED,&mdev->flags) && + mdev->conf.wire_protocol == DRBD_PROT_C) + drbd_set_in_sync(mdev,sector,blksize); + } + } + + if(is_syncer_blk(mdev,p->block_id)) { + dec_rs_pending(mdev); + } else { + D_ASSERT(mdev->conf.wire_protocol != DRBD_PROT_A); + dec_ap_pending(mdev); + } + return TRUE; +} + +STATIC int got_NegAck(drbd_dev *mdev, Drbd_Header* h) +{ + Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h; +#if 0 + sector_t sector = be64_to_cpu(p->sector); + int size = be32_to_cpu(p->blksize); +#endif + + /* do nothing here. + * we expect to get a "report param" on the data socket soon, + * and will do the cleanup then and there. + */ + if(is_syncer_blk(mdev,p->block_id)) { + dec_rs_pending(mdev); + } +#if 0 + else { + D_ASSERT(bm_get_bit(mdev->mbds_id,sector,size)); + // tl_clear() must have set this out of sync! + D_ASSERT(mdev->conf.wire_protocol != DRBD_PROT_A); + dec_ap_pending(mdev,HERE); + } +#endif + if (DRBD_ratelimit(5*HZ,5)) + WARN("Got NegAck packet. Peer is in troubles?\n"); + + return TRUE; +} + +STATIC int got_NegDReply(drbd_dev *mdev, Drbd_Header* h) +{ + /* drbd_request_t *req; + * unused now */ + Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h; + + if (is_syncer_blk(mdev,p->block_id)) { + /* no resync data available. don't panic just yet ... */ + printk(KERN_EMERG DEVICE_NAME "%d: " + "Got NegDReply for resync request. " + "WE ARE LOST. We lost our up-to-date disk.\n", + (int)(mdev-drbd_conf)); + return FALSE; + } /* else { */ + +#if 0 + /* hey, we panic anyways. so why bother? 
*/ + req = (drbd_request_t *)(long)p->block_id; + if (VALID_POINTER(req)) { + D_ASSERT(req->w.cb == w_is_app_read); + + spin_lock(&mdev->pr_lock); + list_del(&req->w.list); + spin_unlock(&mdev->pr_lock); + + INVALIDATE_MAGIC(req); + mempool_free(req,drbd_request_mempool); + } +#endif + + drbd_panic("Got NegDReply. WE ARE LOST. We lost our up-to-date disk.\n"); + + // THINK do we have other options, but panic? + // what about bio_endio, in case we don't panic ?? + + return FALSE; +} + +STATIC int got_NegRSDReply(drbd_dev *mdev, Drbd_Header* h) +{ + sector_t sector; + Drbd_BlockAck_Packet *p = (Drbd_BlockAck_Packet*)h; + + sector = be64_to_cpu(p->sector); + D_ASSERT(p->block_id == ID_SYNCER); + + drbd_rs_complete_io(mdev,sector); + + drbd_panic("Got NegRSDReply. WE ARE LOST. We lost our up-to-date disk.\n"); + + // THINK do we have other options, but panic? + // what about bio_endio, in case we don't panic ?? + + return TRUE; +} + +STATIC int got_BarrierAck(drbd_dev *mdev, Drbd_Header* h) +{ + Drbd_BarrierAck_Packet *p = (Drbd_BarrierAck_Packet*)h; + + smp_rmb(); + if(unlikely(test_bit(PARTNER_DISKLESS,&mdev->flags))) return TRUE; + + tl_release(mdev,p->barrier,be32_to_cpu(p->set_size)); + dec_ap_pending(mdev); + + return TRUE; +} + +struct asender_cmd { + size_t pkt_size; + int (*process)(drbd_dev *mdev, Drbd_Header* h); +}; + +int drbd_asender(struct Drbd_thread *thi) +{ + drbd_dev *mdev = thi->mdev; + Drbd_Header *h = &mdev->meta.rbuf.head; + + int rv,len; + void *buf = h; + int received = 0; + int expect = sizeof(Drbd_Header); + int cmd = -1; + + static struct asender_cmd asender_tbl[] = { + [Ping] ={ sizeof(Drbd_Header), got_Ping }, + [PingAck] ={ sizeof(Drbd_Header), got_PingAck }, + [RecvAck] ={ sizeof(Drbd_BlockAck_Packet), got_BlockAck }, + [WriteAck] ={ sizeof(Drbd_BlockAck_Packet), got_BlockAck }, + [NegAck] ={ sizeof(Drbd_BlockAck_Packet), got_NegAck }, + [NegDReply] ={ sizeof(Drbd_BlockAck_Packet), got_NegDReply }, + [NegRSDReply]={sizeof(Drbd_BlockAck_Packet), got_NegRSDReply}, + [BarrierAck]={ sizeof(Drbd_BarrierAck_Packet),got_BarrierAck }, + }; + + sprintf(current->comm, "drbd%d_asender", (int)(mdev-drbd_conf)); + + current->policy = SCHED_RR; /* Make this a realtime task! */ + current->rt_priority = 2; /* more important than all other tasks */ + + while (get_t_state(thi) == Running) { + if (test_and_clear_bit(SEND_PING, &mdev->flags)) { + ERR_IF(!drbd_send_ping(mdev)) goto err; + // half ack timeout only, + // since sendmsg waited the other half already + mdev->meta.socket->sk->SK_(rcvtimeo) = + mdev->conf.timeout*HZ/20; + } + + /* FIXME this *should* be below drbd_process_ee, + * but that leads to some distributed deadlock :-( + * this needs to be fixed properly, I'd vote for a separate + * msock sender thread, but others will frown upon yet an other + * kernel thread... + * -- lge + */ + set_bit(SIGNAL_ASENDER, &mdev->flags); + + if (!drbd_process_ee(mdev,0)) goto err; + + rv = drbd_recv_short(mdev,buf,expect-received); + clear_bit(SIGNAL_ASENDER, &mdev->flags); + + drbd_flush_signals(current); + + /* Note: + * -EINTR (on meta) we got a signal + * -EAGAIN (on meta) rcvtimeo expired + * -ECONNRESET other side closed the connection + * -ERESTARTSYS (on data) we got a signal + * rv < 0 other than above: unexpected error! 
+ * rv == expected: full header or command + * rv < expected: "woken" by signal during receive + * rv == 0 : "connection shut down by peer" + */ + if (likely(rv > 0)) { + received += rv; + buf += rv; + } else if (rv == 0) { + ERR("meta connection shut down by peer.\n"); + goto err; + } else if (rv == -EAGAIN) { + if( mdev->meta.socket->sk->SK_(rcvtimeo) == + mdev->conf.timeout*HZ/20) { + ERR("PingAck did not arrive in time.\n"); + goto err; + } + set_bit(SEND_PING,&mdev->flags); + continue; + } else if (rv == -EINTR) { + continue; + } else { + ERR("sock_recvmsg returned %d\n", rv); + goto err; + } + + if (received == expect && cmd == -1 ) { + cmd = be16_to_cpu(h->command); + len = be16_to_cpu(h->length); + if (unlikely( h->magic != BE_DRBD_MAGIC )) { + ERR("magic?? m: 0x%lx c: %d l: %d\n", + (long)be32_to_cpu(h->magic), + h->command, h->length); + goto err; + } + expect = asender_tbl[cmd].pkt_size; + ERR_IF(len != expect-sizeof(Drbd_Header)) { + dump_packet(mdev,mdev->meta.socket,1,(void*)h, __FILE__, __LINE__); + DUMPI(expect); + } + } + if(received == expect) { + D_ASSERT(cmd != -1); + dump_packet(mdev,mdev->meta.socket,1,(void*)h, __FILE__, __LINE__); + if(!asender_tbl[cmd].process(mdev,h)) goto err; + + buf = h; + received = 0; + expect = sizeof(Drbd_Header); + cmd = -1; + } + } //while + + if(0) { + err: + clear_bit(SIGNAL_ASENDER, &mdev->flags); + if (mdev->cstate >= Connected) + set_cstate(mdev,NetworkFailure); + drbd_thread_restart_nowait(&mdev->receiver); + } + + INFO("asender terminated\n"); + + return 0; +} --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./drivers/block/drbd/drbd_req.c 2006-06-21 16:57:01.000000000 +0400 @@ -0,0 +1,422 @@ +/* +-*- linux-c -*- + drbd_req.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 1999-2004, Philipp Reisner . + main author. + + Copyright (C) 2002-2004, Lars Ellenberg . + main contributor. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include + +#include +#include +#include "drbd_int.h" + +void drbd_end_req(drbd_request_t *req, int nextstate, int er_flags, + sector_t rsector) +{ + /* This callback will be called in irq context by the IDE drivers, + and in Softirqs/Tasklets/BH context by the SCSI drivers. + This function is called by the receiver in kernel-thread context. 
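+   That is why req_lock is taken with spin_lock_irqsave() below: the
+   completion may arrive from hard or soft interrupt context as well as
+   from process context.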
+ Try to get the locking right :) */ + + struct Drbd_Conf* mdev = drbd_req_get_mdev(req); + unsigned long flags=0; + int uptodate; + + PARANOIA_BUG_ON(!IS_VALID_MDEV(mdev)); + PARANOIA_BUG_ON(drbd_req_get_sector(req) != rsector); + spin_lock_irqsave(&mdev->req_lock,flags); + + if(req->rq_status & nextstate) { + ERR("request state error(%d)\n", req->rq_status); + } + + req->rq_status |= nextstate; + req->rq_status &= er_flags | ~0x0001; + if( (req->rq_status & RQ_DRBD_DONE) == RQ_DRBD_DONE ) goto end_it; + + spin_unlock_irqrestore(&mdev->req_lock,flags); + + return; + +/* We only report uptodate == TRUE if both operations (WRITE && SEND) + reported uptodate == TRUE + */ + + end_it: + spin_unlock_irqrestore(&mdev->req_lock,flags); + + if( req->rq_status & RQ_DRBD_IN_TL ) { + if( ! ( er_flags & ERF_NOTLD ) ) { + /*If this call is from tl_clear() we may not call + tl_dependene, otherwhise we have a homegrown + spinlock deadlock. */ + if(tl_dependence(mdev,req)) + set_bit(ISSUE_BARRIER,&mdev->flags); + } else { + list_del(&req->w.list); // we have the tl_lock... + } + } + + uptodate = req->rq_status & 0x0001; + if( !uptodate && mdev->on_io_error == Detach) { + drbd_set_out_of_sync(mdev,rsector, drbd_req_get_size(req)); + // It should also be as out of sync on + // the other side! See w_io_error() + + drbd_bio_endio(req->master_bio,1); + dec_ap_bio(mdev); + // The assumption is that we wrote it on the peer. + +// FIXME proto A and diskless :) + + req->w.cb = w_io_error; + drbd_queue_work(mdev,&mdev->data.work,&req->w); + + goto out; + + } + + drbd_bio_endio(req->master_bio,uptodate); + dec_ap_bio(mdev); + + INVALIDATE_MAGIC(req); + mempool_free(req,drbd_request_mempool); + + out: + if (test_bit(ISSUE_BARRIER,&mdev->flags)) { + spin_lock_irqsave(&mdev->req_lock,flags); + if(list_empty(&mdev->barrier_work.list)) { + _drbd_queue_work(&mdev->data.work,&mdev->barrier_work); + } + spin_unlock_irqrestore(&mdev->req_lock,flags); + } +} + +int drbd_read_remote(drbd_dev *mdev, drbd_request_t *req) +{ + int rv; + drbd_bio_t *bio = req->master_bio; + + req->w.cb = w_is_app_read; + spin_lock(&mdev->pr_lock); + list_add(&req->w.list,&mdev->app_reads); + spin_unlock(&mdev->pr_lock); + set_bit(UNPLUG_REMOTE,&mdev->flags); +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + rv=drbd_send_drequest(mdev, DataRequest, bio->b_rsector, bio->b_size, + (unsigned long)req); +#else + rv=drbd_send_drequest(mdev, DataRequest, bio->bi_sector, bio->bi_size, + (unsigned long)req); +#endif + return rv; +} + + +/* we may do a local read if: + * - we are consistent (of course), + * - or we are generally inconsistent, + * BUT we are still/already IN SYNC for this area. + * since size may be up to PAGE_SIZE, but BM_BLOCK_SIZE may be smaller + * than PAGE_SIZE, we may need to check several bits. 
+ */ +STATIC int drbd_may_do_local_read(drbd_dev *mdev, sector_t sector, int size) +{ + unsigned long sbnr,ebnr,bnr; + sector_t esector, nr_sectors; + + if (drbd_md_test_flag(mdev,MDF_Consistent)) return 1; + + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size>>9) -1; + + D_ASSERT(sector < nr_sectors); + D_ASSERT(esector < nr_sectors); + + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + + for (bnr = sbnr; bnr <= ebnr; bnr++) { + if (drbd_bm_test_bit(mdev,bnr)) return 0; + } + return 1; +} + +STATIC int +drbd_make_request_common(drbd_dev *mdev, int rw, int size, + sector_t sector, drbd_bio_t *bio) +{ + drbd_request_t *req; + int local, remote; + + if (unlikely(drbd_did_panic == DRBD_MAGIC)) { + drbd_bio_IO_error(bio); + return 0; + } + + /* + * If someone tries to mount on Secondary, and this is a 2.4 kernel, + * it would lead to a readonly mounted, but not cache-coherent, + * therefore dangerous, filesystem. + * On 2.6 this is prevented by bd_claiming the device. + * It is not that easy in 2.4. + * + * Because people continue to report they mount readonly, it does not + * do what they expect, and their logs fill with messages and stuff. + * + * Since it just won't work, we just fail IO here. + * [ ... until we implement some shared mode, and our users confirm by + * configuration, that they handle cache coherency themselves ... ] + */ + if (mdev->state != Primary && + ( !disable_bd_claim || rw == WRITE ) ) { + if (DRBD_ratelimit(5*HZ,5)) { + ERR("Not in Primary state, no %s requests allowed\n", + disable_bd_claim ? "WRITE" : "IO"); + } + drbd_bio_IO_error(bio); + return 0; + } + + /* + * Paranoia: we might have been primary, but sync target, or + * even diskless, then lost the connection. + * This should have been handled (panic? suspend?) somehwere + * else. But maybe it was not, so check again here. + * Caution: as long as we do not have a read/write lock on mdev, + * to serialize state changes, this is racy, since we may lose + * the connection *after* we test for the cstate. + */ + if ( ( test_bit(DISKLESS,&mdev->flags) + || !drbd_md_test_flag(mdev,MDF_Consistent) + ) && mdev->cstate < Connected ) + { + ERR("Sorry, I have no access to good data anymore.\n"); +/* + FIXME suspend, loop waiting on cstate wait? panic? +*/ + drbd_bio_IO_error(bio); + return 0; + } + + /* allocate outside of all locks + */ + req = mempool_alloc(drbd_request_mempool, GFP_NOIO); + if (!req) { + /* only pass the error to the upper layers. + * if user cannot handle io errors, thats not our business. + */ + ERR("could not kmalloc() req\n"); + drbd_bio_IO_error(bio); + return 0; + } + SET_MAGIC(req); + req->master_bio = bio; + + // XXX maybe merge both variants into one + if (rw == WRITE) drbd_req_prepare_write(mdev,req); + else drbd_req_prepare_read(mdev,req); + + /* XXX req->w.cb = something; drbd_queue_work() .... + * Not yet. + */ + + // down_read(mdev->device_lock); + + wait_event( mdev->cstate_wait, + (volatile int)(mdev->cstate < WFBitMapS || + mdev->cstate > WFBitMapT) ); + + local = inc_local(mdev); + NOT_IN_26( if (rw == READA) rw=READ ); + if (rw == READ || rw == READA) { + if (local) { + if (!drbd_may_do_local_read(mdev,sector,size)) { + /* whe could kick the syncer to + * sync this extent asap, wait for + * it, then continue locally. + * Or just issue the request remotely. + */ + /* FIXME + * I think we have a RACE here. We request + * something from the peer, then later some + * write starts ... 
and finished *before* + * the answer to the read comes in, because + * the ACK for the WRITE goes over + * meta-socket ... + * Maybe we need to properly lock reads + * against the syncer, too. But if we have + * some user issuing writes on an area that + * he has pending reads on, _he_ is really + * broke anyways, and would get "undefined + * results" on _any_ io stack, even just the + * local io stack. + */ + local = 0; + dec_local(mdev); + } + } + remote = !local && test_bit(PARTNER_CONSISTENT, &mdev->flags); + } else { + remote = 1; + } + + /* If we have a disk, but a READA request is mapped to remote, + * we are Primary, Inconsistent, SyncTarget. + * Just fail that READA request right here. + * + * THINK: maybe fail all READA when not local? + * or make this configurable... + * if network is slow, READA won't do any good. + */ + if (rw == READA && !test_bit(DISKLESS,&mdev->flags) && !local) { + drbd_bio_IO_error(bio); + return 0; + } + + if (rw == WRITE && local) + drbd_al_begin_io(mdev, sector); + + remote = remote && (mdev->cstate >= Connected) + && !test_bit(PARTNER_DISKLESS,&mdev->flags); + + if (!(local || remote)) { + ERR("IO ERROR: neither local nor remote disk\n"); + // FIXME PANIC ?? + drbd_bio_IO_error(bio); + return 0; + } + + /* do this first, so I do not need to call drbd_end_req, + * but can set the rq_status directly. + */ + if (!local) + req->rq_status |= RQ_DRBD_LOCAL; + if (!remote) + req->rq_status |= RQ_DRBD_SENT; + + /* we need to plug ALWAYS since we possibly need to kick lo_dev */ + drbd_plug_device(mdev); + + inc_ap_bio(mdev); + if (remote) { + /* either WRITE and Connected, + * or READ, and no local disk, + * or READ, but not in sync. + */ + inc_ap_pending(mdev); + if (rw == WRITE) { + if (!drbd_send_dblock(mdev,req)) { + if (mdev->cstate >= Connected) + set_cstate(mdev,NetworkFailure); + dec_ap_pending(mdev); + drbd_thread_restart_nowait(&mdev->receiver); + } else if(mdev->conf.wire_protocol == DRBD_PROT_A) { + dec_ap_pending(mdev); + drbd_end_req(req, RQ_DRBD_SENT, 1, sector); + } + } else { + // this node is diskless ... + drbd_read_remote(mdev,req); + } + } + + if (local) { + if (rw == WRITE) { + if (!remote) drbd_set_out_of_sync(mdev,sector,size); + } else { + D_ASSERT(!remote); + } + /* FIXME + * Should we add even local reads to some list, so + * they can be grabbed and freed somewhen? + * + * They already have a reference count (sort of...) + * on mdev via inc_local() + */ + if(rw == WRITE) mdev->writ_cnt += size>>9; + else mdev->read_cnt += size>>9; + + // in 2.4.X, READA are submitted as READ. 
+ drbd_generic_make_request(rw,drbd_req_private_bio(req)); + } + + // up_read(mdev->device_lock); + return 0; +} + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) +int drbd_make_request_24(request_queue_t *q, int rw, struct buffer_head *bh) +{ + struct Drbd_Conf* mdev = drbd_conf + MINOR(bh->b_rdev); + if (MINOR(bh->b_rdev) >= minor_count || mdev->cstate < StandAlone) { + buffer_IO_error(bh); + return 0; + } + + return drbd_make_request_common(mdev,rw,bh->b_size,bh->b_rsector,bh); +} +#else +int drbd_make_request_26(request_queue_t *q, struct bio *bio) +{ + unsigned int s_enr,e_enr; + struct Drbd_Conf* mdev = (drbd_dev*) q->queuedata; + if (mdev->cstate < StandAlone) { + drbd_bio_IO_error(bio); + return 0; + } + + /* + * what we "blindly" assume: + */ + D_ASSERT(bio->bi_size > 0); + D_ASSERT( (bio->bi_size & 0x1ff) == 0); + D_ASSERT(bio->bi_size <= PAGE_SIZE); + D_ASSERT(bio->bi_vcnt == 1); + D_ASSERT(bio->bi_idx == 0); + + s_enr = bio->bi_sector >> (AL_EXTENT_SIZE_B-9); + e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> (AL_EXTENT_SIZE_B-9); + D_ASSERT(e_enr >= s_enr); + + if(unlikely(s_enr != e_enr)) { + /* This bio crosses an AL_EXTENT boundary, so we have to + * split it. [So far, only XFS is known to do this...] + */ + struct bio_pair *bp; + bp = bio_split(bio, bio_split_pool, + (e_enr<<(AL_EXTENT_SIZE_B-9)) - bio->bi_sector); + drbd_make_request_26(q,&bp->bio1); + drbd_make_request_26(q,&bp->bio2); + bio_pair_release(bp); + return 0; + } + + return drbd_make_request_common(mdev,bio_rw(bio),bio->bi_size, + bio->bi_sector,bio); +} +#endif --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./drivers/block/drbd/drbd_sizeof_sanity_check.c 2006-06-21 16:57:01.000000000 +0400 @@ -0,0 +1,24 @@ +#include +#include + +#define SZO(type,size) \ + s = sizeof(type); \ + if (s != size) { \ + printk("<3>sizeof(" #type "): %d != %d\n", s, size); \ + err = -1; \ + } + +int sizeof_drbd_structs_sanity_check(void) +{ + int err = 0, s = 0; + SZO(struct disk_config, 24) + SZO(struct net_config, 304) + SZO(struct syncer_config, 24) + SZO(struct ioctl_disk_config, 32) + SZO(struct ioctl_net_config, 312) + SZO(struct ioctl_syncer_config, 32) + SZO(struct ioctl_wait, 16) + SZO(struct ioctl_get_config, 440) + if (err) printk("<3>ioctls won't work, aborting\n"); + return err; +} --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./drivers/block/drbd/drbd_worker.c 2006-06-21 16:57:01.000000000 +0400 @@ -0,0 +1,985 @@ +/* +-*- linux-c -*- + drbd_worker.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 2003-2004, Philipp Reisner . + Copyright (C) 2003-2004, Lars Ellenberg . + authors. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 
+ + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) || defined(HAVE_MM_INLINE_H) +#include // for the page_count macro on RH/Fedora +#endif +#include + +#include +#include "drbd_int.h" + +/* I choose to have all block layer end_io handlers defined here. + + * For all these callbacks, note the follwing: + * The callbacks will be called in irq context by the IDE drivers, + * and in Softirqs/Tasklets/BH context by the SCSI drivers. + * Try to get the locking right :) + * + */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) + +/* used for synchronous meta data and bitmap IO + * submitted by FIXME (I'd say worker only, but currently this is not true...) + */ +void drbd_md_io_complete(struct buffer_head *bh, int uptodate) +{ + if (uptodate) + set_bit(BH_Uptodate, &bh->b_state); + + complete((struct completion*)bh->b_private); +} + +/* reads on behalf of the partner, + * "submitted" by the receiver + */ +void enslaved_read_bi_end_io(drbd_bio_t *bh, int uptodate) +{ + unsigned long flags=0; + struct Tl_epoch_entry *e=NULL; + struct Drbd_Conf* mdev; + + mdev=bh->b_private; + PARANOIA_BUG_ON(!IS_VALID_MDEV(mdev)); + + e = container_of(bh,struct Tl_epoch_entry,private_bio); + PARANOIA_BUG_ON(!VALID_POINTER(e)); + D_ASSERT(e->block_id != ID_VACANT); + + spin_lock_irqsave(&mdev->ee_lock,flags); + + mark_buffer_uptodate(bh, uptodate); + clear_bit(BH_Lock, &bh->b_state); + smp_mb__after_clear_bit(); + + list_del(&e->w.list); + if(list_empty(&mdev->read_ee)) wake_up(&mdev->ee_wait); + spin_unlock_irqrestore(&mdev->ee_lock,flags); + + drbd_chk_io_error(mdev,!uptodate); + drbd_queue_work(mdev,&mdev->data.work,&e->w); + dec_local(mdev); +} + +/* writes on behalf of the partner, or resync writes, + * "submitted" by the receiver. 
+ */ +void drbd_dio_end_sec(struct buffer_head *bh, int uptodate) +{ + unsigned long flags=0; + struct Tl_epoch_entry *e=NULL; + struct Drbd_Conf* mdev; + + mdev=bh->b_private; + PARANOIA_BUG_ON(!IS_VALID_MDEV(mdev)); + + e = container_of(bh,struct Tl_epoch_entry,private_bio); + PARANOIA_BUG_ON(!VALID_POINTER(e)); + D_ASSERT(e->block_id != ID_VACANT); + + spin_lock_irqsave(&mdev->ee_lock,flags); + + mark_buffer_uptodate(bh, uptodate); + + clear_bit(BH_Dirty, &bh->b_state); + clear_bit(BH_Lock, &bh->b_state); + smp_mb__after_clear_bit(); + + list_del(&e->w.list); + list_add_tail(&e->w.list,&mdev->done_ee); + + if (waitqueue_active(&mdev->ee_wait) && + (list_empty(&mdev->active_ee) || + list_empty(&mdev->sync_ee))) + wake_up(&mdev->ee_wait); + + spin_unlock_irqrestore(&mdev->ee_lock,flags); + + drbd_chk_io_error(mdev,!uptodate); + wake_asender(mdev); + dec_local(mdev); +} + +/* writes on Primary comming from drbd_make_request + */ +void drbd_dio_end(struct buffer_head *bh, int uptodate) +{ + struct Drbd_Conf* mdev; + drbd_request_t *req; + + mdev = bh->b_private; + PARANOIA_BUG_ON(!IS_VALID_MDEV(mdev)); + + req = container_of(bh,struct drbd_request,private_bio); + PARANOIA_BUG_ON(!VALID_POINTER(req)); + + drbd_chk_io_error(mdev,!uptodate); + drbd_end_req(req, RQ_DRBD_LOCAL, uptodate, drbd_req_get_sector(req)); + drbd_al_complete_io(mdev,drbd_req_get_sector(req)); + dec_local(mdev); +} + +/* reads on Primary comming from drbd_make_request + */ +void drbd_read_bi_end_io(struct buffer_head *bh, int uptodate) +{ + struct Drbd_Conf* mdev; + drbd_request_t *req; + + mdev = bh->b_private; + PARANOIA_BUG_ON(!IS_VALID_MDEV(mdev)); + + req = container_of(bh,struct drbd_request,private_bio); + PARANOIA_BUG_ON(!VALID_POINTER(req)); + + // no special case for READA here, in 2.4.X we submit them as READ. + if (!uptodate) { + // for the panic: + drbd_chk_io_error(mdev,!uptodate); // handle panic and detach. + if(mdev->on_io_error == PassOn) goto pass_on; + // ok, if we survived this, retry: + // FIXME sector ... 
+ if (DRBD_ratelimit(5*HZ,5)) + ERR("local read failed, retrying remotely\n"); + req->w.cb = w_read_retry_remote; + drbd_queue_work(mdev,&mdev->data.work,&req->w); + } else { + pass_on: + req->master_bio->b_end_io(req->master_bio,uptodate); + dec_ap_bio(mdev); + + INVALIDATE_MAGIC(req); + mempool_free(req,drbd_request_mempool); + } + dec_local(mdev); +} + +#else + +/* used for synchronous meta data and bitmap IO + * submitted by drbd_md_sync_page_io() + */ +int drbd_md_io_complete(struct bio *bio, unsigned int bytes_done, int error) +{ + if (bio->bi_size) + return 1; + + complete((struct completion*)bio->bi_private); + return 0; +} + +/* reads on behalf of the partner, + * "submitted" by the receiver + */ +int enslaved_read_bi_end_io(struct bio *bio, unsigned int bytes_done, int error) +{ + unsigned long flags=0; + struct Tl_epoch_entry *e=NULL; + struct Drbd_Conf* mdev; + + mdev=bio->bi_private; + PARANOIA_BUG_ON(!IS_VALID_MDEV(mdev)); + + /* we should be called via bio_endio, so this should never be the case + * but "everyone else does it", and so do we ;) -lge + */ + ERR_IF (bio->bi_size) + return 1; + + e = container_of(bio,struct Tl_epoch_entry,private_bio); + PARANOIA_BUG_ON(!VALID_POINTER(e)); + D_ASSERT(e->block_id != ID_VACANT); + + spin_lock_irqsave(&mdev->ee_lock,flags); + list_del(&e->w.list); + if(list_empty(&mdev->read_ee)) wake_up(&mdev->ee_wait); + spin_unlock_irqrestore(&mdev->ee_lock,flags); + + drbd_chk_io_error(mdev,error); + drbd_queue_work(mdev,&mdev->data.work,&e->w); + dec_local(mdev); + return 0; +} + +/* writes on behalf of the partner, or resync writes, + * "submitted" by the receiver. + */ +int drbd_dio_end_sec(struct bio *bio, unsigned int bytes_done, int error) +{ + unsigned long flags=0; + struct Tl_epoch_entry *e=NULL; + struct Drbd_Conf* mdev; + + mdev=bio->bi_private; + PARANOIA_BUG_ON(!IS_VALID_MDEV(mdev)); + + // see above + ERR_IF (bio->bi_size) + return 1; + + e = container_of(bio,struct Tl_epoch_entry,private_bio); + PARANOIA_BUG_ON(!VALID_POINTER(e)); + D_ASSERT(e->block_id != ID_VACANT); + + spin_lock_irqsave(&mdev->ee_lock,flags); + list_del(&e->w.list); + list_add_tail(&e->w.list,&mdev->done_ee); + + if (waitqueue_active(&mdev->ee_wait) && + (list_empty(&mdev->active_ee) || + list_empty(&mdev->sync_ee))) + wake_up(&mdev->ee_wait); + + spin_unlock_irqrestore(&mdev->ee_lock,flags); + + drbd_chk_io_error(mdev,error); + wake_asender(mdev); + dec_local(mdev); + return 0; +} + +/* writes on Primary comming from drbd_make_request + */ +int drbd_dio_end(struct bio *bio, unsigned int bytes_done, int error) +{ + drbd_request_t *req=bio->bi_private; + struct Drbd_Conf* mdev=req->mdev; + sector_t rsector; + + // see above + ERR_IF (bio->bi_size) + return 1; + + drbd_chk_io_error(mdev,error); + rsector = drbd_req_get_sector(req); + // the bi_sector of the bio gets modified somewhere in drbd_end_req()! + drbd_end_req(req, RQ_DRBD_LOCAL, (error == 0), rsector); + drbd_al_complete_io(mdev,rsector); + dec_local(mdev); + bio_put(bio); + return 0; +} + +/* reads on Primary comming from drbd_make_request + */ +int drbd_read_bi_end_io(struct bio *bio, unsigned int bytes_done, int error) +{ + drbd_request_t *req=bio->bi_private; + struct Drbd_Conf* mdev=req->mdev; + + // see above + ERR_IF (bio->bi_size) + return 1; + + /* READAs may fail. + * upper layers need to be able to handle that themselves */ + if (bio_rw(bio) == READA) goto pass_on; + if (error) { + drbd_chk_io_error(mdev,error); // handle panic and detach. 
+ if(mdev->on_io_error == PassOn) goto pass_on; + // ok, if we survived this, retry: + // FIXME sector ... + if (DRBD_ratelimit(5*HZ,5)) + ERR("local read failed, retrying remotely\n"); + req->w.cb = w_read_retry_remote; + drbd_queue_work(mdev,&mdev->data.work,&req->w); + } else { + pass_on: + bio_endio(req->master_bio,req->master_bio->bi_size,error); + dec_ap_bio(mdev); + + INVALIDATE_MAGIC(req); + mempool_free(req,drbd_request_mempool); + } + + bio_put(bio); + dec_local(mdev); + return 0; +} +#endif + +int w_io_error(drbd_dev* mdev, struct drbd_work* w,int cancel) +{ + drbd_request_t *req = (drbd_request_t*)w; + int ok; + + /* FIXME send a "set_out_of_sync" packet to the peer + * in the PassOn case... + * in the Detach (or Panic) case, we (try to) send + * a "we are diskless" param packet anyways, and the peer + * will then set the FullSync bit in the meta data ... + */ + D_ASSERT(mdev->on_io_error != PassOn); + + INVALIDATE_MAGIC(req); + mempool_free(req,drbd_request_mempool); + + if(unlikely(cancel)) return 1; + + ok = drbd_io_error(mdev); + if(unlikely(!ok)) ERR("Sending in w_io_error() failed\n"); + return ok; +} + +int w_read_retry_remote(drbd_dev* mdev, struct drbd_work* w,int cancel) +{ + drbd_request_t *req = (drbd_request_t*)w; + int ok; + + smp_rmb(); + if ( cancel || + mdev->cstate < Connected || + !test_bit(PARTNER_CONSISTENT,&mdev->flags) ) { + drbd_panic("WE ARE LOST. Local IO failure, no peer.\n"); + + // does not make much sense, but anyways... + drbd_bio_endio(req->master_bio,0); + dec_ap_bio(mdev); + mempool_free(req,drbd_request_mempool); + return 1; + } + + // FIXME: what if partner was SyncTarget, and is out of sync for + // this area ?? ... should be handled in the receiver. + + ok = drbd_io_error(mdev); + if(unlikely(!ok)) ERR("Sending in w_read_retry_remote() failed\n"); + + inc_ap_pending(mdev); + ok = drbd_read_remote(mdev,req); + if(unlikely(!ok)) { + ERR("drbd_read_remote() failed\n"); + /* dec_ap_pending and bio_io_error are done in + * drbd_fail_pending_reads + */ + } + return ok; +} + +int w_resync_inactive(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + ERR_IF(cancel) return 1; + ERR("resync inactive, but callback triggered??\n"); + return 0; +} + +/* FIXME + * not used any longer, they now use e_end_resync_block. + * maybe remove again? + */ +int w_is_resync_read(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + ERR("%s: Typecheck only, should never be called!\n", __FUNCTION__ ); + return 0; +} + +/* in case we need it. 
currently unused, + * since should be assigned to "w_read_retry_remote" + */ +int w_is_app_read(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + ERR("%s: Typecheck only, should never be called!\n", __FUNCTION__ ); + return 0; +} + +void resync_timer_fn(unsigned long data) +{ + unsigned long flags; + drbd_dev* mdev = (drbd_dev*) data; + + spin_lock_irqsave(&mdev->req_lock,flags); + + if(likely(!test_and_clear_bit(STOP_SYNC_TIMER,&mdev->flags))) { + mdev->resync_work.cb = w_make_resync_request; + } else { + mdev->resync_work.cb = w_resume_next_sg; + } + + if(list_empty(&mdev->resync_work.list)) { + _drbd_queue_work(&mdev->data.work,&mdev->resync_work); + } else INFO("Avoided requeue of resync_work\n"); + + spin_unlock_irqrestore(&mdev->req_lock,flags); +} + +#define SLEEP_TIME (HZ/10) + +int w_make_resync_request(drbd_dev* mdev, struct drbd_work* w,int cancel) +{ + unsigned long bit; + sector_t sector; + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); + int number,i,size; + + PARANOIA_BUG_ON(w != &mdev->resync_work); + + if(unlikely(cancel)) return 1; + + if(unlikely(mdev->cstate < Connected)) { + ERR("Confused in w_make_resync_request()! cstate < Connected"); + return 0; + } + + if (mdev->cstate != SyncTarget) { + ERR("%s in w_make_resync_request\n", cstate_to_name(mdev->cstate)); + } + + number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); + + if (atomic_read(&mdev->rs_pending_cnt)>number) { + goto requeue; + } + number -= atomic_read(&mdev->rs_pending_cnt); + + for(i=0;iresync_work.cb = w_resync_inactive; + return 1; + } + + sector = BM_BIT_TO_SECT(bit); + + if(!drbd_rs_begin_io(mdev,sector)) { + // we have been interrupted, probably connection lost! + D_ASSERT(signal_pending(current)); + return 0; + } + + if(unlikely( drbd_bm_test_bit(mdev,bit) == 0 )) { + //INFO("Block got synced while in drbd_rs_begin_io()\n"); + drbd_rs_complete_io(mdev,sector); + goto next_sector; + } + + if (sector + (size>>9) > capacity) size = (capacity-sector)<<9; + inc_rs_pending(mdev); + if(!drbd_send_drequest(mdev,RSDataRequest, + sector,size,ID_SYNCER)) { + ERR("drbd_send_drequest() failed, aborting..."); + dec_rs_pending(mdev); + return 0; // FAILED. worker will abort! + } + } + + if(drbd_bm_rs_done(mdev)) { + /* last syncer _request_ was sent, + * but the RSDataReply not yet received. sync will end (and + * next sync group will resume), as soon as we receive the last + * resync data block, and the last bit is cleared. + * until then resync "work" is "inactive" ... + */ + mdev->resync_work.cb = w_resync_inactive; + return 1; + } + + requeue: + mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); + return 1; +} + +int drbd_resync_finished(drbd_dev* mdev) +{ + unsigned long db,dt,dbdt; + + dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; + if (dt <= 0) dt=1; + db = mdev->rs_total; + dbdt = Bit2KB(db/dt); + mdev->rs_paused /= HZ; + INFO("Resync done (total %lu sec; paused %lu sec; %lu K/sec)\n", + dt + mdev->rs_paused, mdev->rs_paused, dbdt); + + if (mdev->cstate == SyncTarget || mdev->cstate == PausedSyncT) { + drbd_md_set_flag(mdev,MDF_Consistent); + ERR_IF(drbd_md_test_flag(mdev,MDF_FullSync)) + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_write(mdev); + } else if (mdev->cstate == SyncSource || mdev->cstate == PausedSyncS) { + set_bit(PARTNER_CONSISTENT, &mdev->flags); + } else { + ERR("unexpected cstate (%s) in drbd_resync_finished\n", + cstate_to_name(mdev->cstate)); + } + + // assert that all bit-map parts are cleared. 
+ D_ASSERT(list_empty(&mdev->resync->lru)); + D_ASSERT(drbd_bm_total_weight(mdev) == 0); + mdev->rs_total = 0; + mdev->rs_paused = 0; + + set_cstate(mdev,Connected); + + return 1; +} + +int w_e_end_data_req(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w; + int ok; + + if(unlikely(cancel)) { + spin_lock_irq(&mdev->ee_lock); + drbd_put_ee(mdev,e); + spin_unlock_irq(&mdev->ee_lock); + dec_unacked(mdev); + return 1; + } + + if(likely(drbd_bio_uptodate(&e->private_bio))) { + ok=drbd_send_block(mdev, DataReply, e); + } else { + ok=drbd_send_ack(mdev,NegDReply,e); + if (DRBD_ratelimit(5*HZ,5)) + ERR("Sending NegDReply. I guess it gets messy.\n"); + drbd_io_error(mdev); + } + + dec_unacked(mdev); + + spin_lock_irq(&mdev->ee_lock); + if( page_count(drbd_bio_get_page(&e->private_bio)) > 1 ) { + /* This might happen if sendpage() has not finished */ + list_add_tail(&e->w.list,&mdev->net_ee); + } else { + drbd_put_ee(mdev,e); + } + spin_unlock_irq(&mdev->ee_lock); + + if(unlikely(!ok)) ERR("drbd_send_block() failed\n"); + return ok; +} + +int w_e_end_rsdata_req(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + struct Tl_epoch_entry *e = (struct Tl_epoch_entry*)w; + int ok; + + if(unlikely(cancel)) { + spin_lock_irq(&mdev->ee_lock); + drbd_put_ee(mdev,e); + spin_unlock_irq(&mdev->ee_lock); + dec_unacked(mdev); + return 1; + } + + drbd_rs_complete_io(mdev,drbd_ee_get_sector(e)); + + if(likely(drbd_bio_uptodate(&e->private_bio))) { + if (likely( !test_bit(PARTNER_DISKLESS,&mdev->flags) )) { + inc_rs_pending(mdev); + ok=drbd_send_block(mdev, RSDataReply, e); + } else { + if (DRBD_ratelimit(5*HZ,5)) + ERR("Not sending RSDataReply, partner DISKLESS!\n"); + ok=1; + } + } else { + ok=drbd_send_ack(mdev,NegRSDReply,e); + if (DRBD_ratelimit(5*HZ,5)) + ERR("Sending NegDReply. 
I guess it gets messy.\n"); + drbd_io_error(mdev); + } + + dec_unacked(mdev); + + spin_lock_irq(&mdev->ee_lock); + if( page_count(drbd_bio_get_page(&e->private_bio)) > 1 ) { + /* This might happen if sendpage() has not finished */ + list_add_tail(&e->w.list,&mdev->net_ee); + } else { + drbd_put_ee(mdev,e); + } + spin_unlock_irq(&mdev->ee_lock); + + if(unlikely(!ok)) ERR("drbd_send_block() failed\n"); + return ok; +} + +int w_try_send_barrier(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + int ok=1; + + if(unlikely(cancel)) return ok; + + down(&mdev->data.mutex); + if(test_and_clear_bit(ISSUE_BARRIER,&mdev->flags)) { + ok = _drbd_send_barrier(mdev); + } + up(&mdev->data.mutex); + + return ok; +} + +int w_send_write_hint(drbd_dev *mdev, struct drbd_work *w, int cancel) +{ + if (cancel) return 1; + NOT_IN_26(clear_bit(UNPLUG_QUEUED,&mdev->flags)); + return drbd_send_short_cmd(mdev,UnplugRemote); +} + +STATIC void drbd_global_lock(void) +{ + int i; + + local_irq_disable(); + for (i=0; i < minor_count; i++) { + spin_lock(&drbd_conf[i].req_lock); + } +} + +STATIC void drbd_global_unlock(void) +{ + int i; + + for (i=0; i < minor_count; i++) { + spin_unlock(&drbd_conf[i].req_lock); + } + local_irq_enable(); +} + +STATIC void _drbd_rs_resume(drbd_dev *mdev) +{ + Drbd_CState ns; + + ns = mdev->cstate - (PausedSyncS - SyncSource); + D_ASSERT(ns == SyncSource || ns == SyncTarget); + + INFO("Syncer continues.\n"); + mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time; + _set_cstate(mdev,ns); + + if(mdev->cstate == SyncTarget) { + ERR_IF(test_bit(STOP_SYNC_TIMER,&mdev->flags)) { + unsigned long rs_left = drbd_bm_total_weight(mdev); + clear_bit(STOP_SYNC_TIMER,&mdev->flags); + if (rs_left == 0) { + INFO("rs_left==0 in _drbd_rs_resume\n"); + } else { + ERR("STOP_SYNC_TIMER was set in " + "_drbd_rs_resume, but rs_left still %lu\n", + rs_left); + } + } + mod_timer(&mdev->resync_timer,jiffies); + } +} + + +STATIC void _drbd_rs_pause(drbd_dev *mdev) +{ + Drbd_CState ns; + + D_ASSERT(mdev->cstate == SyncSource || mdev->cstate == SyncTarget); + ns = mdev->cstate + (PausedSyncS - SyncSource); + + if(mdev->cstate == SyncTarget) set_bit(STOP_SYNC_TIMER,&mdev->flags); + + mdev->rs_mark_time = jiffies; + // mdev->rs_mark_left = drbd_bm_total_weight(mdev); // I don't care... 
+ _set_cstate(mdev,ns); + INFO("Syncer waits for sync group.\n"); +} + +STATIC int _drbd_pause_higher_sg(drbd_dev *mdev) +{ + drbd_dev *odev; + int i,rv=0; + + for (i=0; i < minor_count; i++) { + odev = drbd_conf + i; + if ( odev->sync_conf.group > mdev->sync_conf.group + && ( odev->cstate == SyncSource || + odev->cstate == SyncTarget ) ) { + _drbd_rs_pause(odev); + rv = 1; + } + } + + return rv; +} + +STATIC int _drbd_lower_sg_running(drbd_dev *mdev) +{ + drbd_dev *odev; + int i,rv=0; + + for (i=0; i < minor_count; i++) { + odev = drbd_conf + i; + if ( odev->sync_conf.group < mdev->sync_conf.group + && ( odev->cstate == SyncSource || + odev->cstate == SyncTarget ) ) { + rv = 1; + } + } + + return rv; +} + +STATIC int _drbd_resume_lower_sg(drbd_dev *mdev) +{ + drbd_dev *odev; + int i,rv=0; + + for (i=0; i < minor_count; i++) { + odev = drbd_conf + i; + if ( odev->sync_conf.group < mdev->sync_conf.group + && ( odev->cstate == PausedSyncS || + odev->cstate == PausedSyncT ) ) { + _drbd_rs_resume(odev); + rv = 1; + } + } + + return rv; +} + +int w_resume_next_sg(drbd_dev* mdev, struct drbd_work* w, int unused) +{ + drbd_dev *odev; + int i,ng=10000; + + PARANOIA_BUG_ON(w != &mdev->resync_work); + + drbd_global_lock(); + + for (i=0; i < minor_count; i++) { + odev = drbd_conf + i; + if ( odev->sync_conf.group <= mdev->sync_conf.group + && ( odev->cstate == SyncSource || + odev->cstate == SyncTarget ) ) { + goto out; // Sync on an other device in this group + // or a lower group still runs. + } + } + + for (i=0; i < minor_count; i++) { // find next sync group + odev = drbd_conf + i; + if ( odev->sync_conf.group > mdev->sync_conf.group + && odev->sync_conf.group < ng && + (odev->cstate==PausedSyncS || odev->cstate==PausedSyncT)){ + ng = odev->sync_conf.group; + } + } + + for (i=0; i < minor_count; i++) { // resume all devices in next group + odev = drbd_conf + i; + if ( odev->sync_conf.group == ng && + (odev->cstate==PausedSyncS || odev->cstate==PausedSyncT)){ + _drbd_rs_resume(odev); + } + } + + out: + drbd_global_unlock(); + w->cb = w_resync_inactive; + + return 1; +} + +void drbd_alter_sg(drbd_dev *mdev, int ng) +{ + int c = 0, p = 0; + int d = (ng - mdev->sync_conf.group); + + drbd_global_lock(); + mdev->sync_conf.group = ng; + + if( ( mdev->cstate == PausedSyncS || + mdev->cstate == PausedSyncT ) && ( d < 0 ) ) { + if(_drbd_pause_higher_sg(mdev)) c=1; + else if(!_drbd_lower_sg_running(mdev)) c=1; + if(c) _drbd_rs_resume(mdev); + } + + if( ( mdev->cstate == SyncSource || + mdev->cstate == SyncTarget ) && ( d > 0 ) ) { + if(_drbd_resume_lower_sg(mdev)) p=1; + else if(_drbd_lower_sg_running(mdev)) p=1; + if(p) _drbd_rs_pause(mdev); + } + drbd_global_unlock(); +} + +void drbd_start_resync(drbd_dev *mdev, Drbd_CState side) +{ + if(side == SyncTarget) { + drbd_md_clear_flag(mdev,MDF_Consistent); + drbd_bm_reset_find(mdev); + } else if (side == SyncSource) { + clear_bit(PARTNER_CONSISTENT, &mdev->flags); + /* If we are SyncSource we must be consistent. + * FIXME this should be an assertion only, + * otherwise it masks a logic bug somewhere else... + */ + ERR_IF (!drbd_md_test_flag(mdev,MDF_Consistent)) { + // FIXME this is actually a BUG()! + drbd_md_set_flag(mdev,MDF_Consistent); + } + } else { + ERR("Usage error in drbd_start_resync! 
(side == %s)\n", + cstate_to_name(side)); + return; + } + drbd_md_write(mdev); + + set_cstate(mdev,side); + mdev->rs_total = + mdev->rs_mark_left = drbd_bm_total_weight(mdev); + mdev->rs_paused = 0; + mdev->rs_start = + mdev->rs_mark_time = jiffies; + + INFO("Resync started as %s (need to sync %lu KB [%lu bits set]).\n", + cstate_to_name(side), + (unsigned long) mdev->rs_total << (BM_BLOCK_SIZE_B-10), + (unsigned long) mdev->rs_total); + + // FIXME: this was a PARANOIA_BUG_ON, but it triggered! ?? + if (mdev->resync_work.cb != w_resync_inactive) { + if (mdev->resync_work.cb == w_make_resync_request) + ERR("resync_work.cb == w_make_resync_request, should be w_resync_inactive\n"); + else if (mdev->resync_work.cb == w_resume_next_sg) + ERR("resync_work.cb == w_resume_next_sg, should be w_resync_inactive\n"); + else + ERR("resync_work.cb == %p ???, should be w_resync_inactive\n", + mdev->resync_work.cb); + return; + } + + if ( mdev->rs_total == 0 ) { + drbd_resync_finished(mdev); + return; + } + + drbd_global_lock(); + if (mdev->cstate == SyncTarget || mdev->cstate == SyncSource) { + _drbd_pause_higher_sg(mdev); + if(_drbd_lower_sg_running(mdev)) { + _drbd_rs_pause(mdev); + } + } /* else: + * thread of other mdev already paused us, + * or something very strange happend to our cstate! + * I really hate it that we can't have a consistent view of cstate. + */ + drbd_global_unlock(); + + if (mdev->cstate == SyncTarget) { + D_ASSERT(!test_bit(STOP_SYNC_TIMER,&mdev->flags)); + mod_timer(&mdev->resync_timer,jiffies); + } else if (mdev->cstate == PausedSyncT) { + D_ASSERT(test_bit(STOP_SYNC_TIMER,&mdev->flags)); + clear_bit(STOP_SYNC_TIMER,&mdev->flags); + } +} + +int drbd_worker(struct Drbd_thread *thi) +{ + drbd_dev *mdev = thi->mdev; + struct drbd_work *w = 0; + LIST_HEAD(work_list); + int intr,i; + + sprintf(current->comm, "drbd%d_worker", (int)(mdev-drbd_conf)); + + for (;;) { + intr = down_interruptible(&mdev->data.work.s); + + if (unlikely(drbd_did_panic == DRBD_MAGIC)) { + drbd_suicide(); + } + + if (intr) { + D_ASSERT(intr == -EINTR); + drbd_flush_signals(current); + ERR_IF (get_t_state(thi) == Running) + continue; + break; + } + + if (get_t_state(thi) != Running) break; + /* With this break, we have done an down() but not consumed + the entry from the list. The cleanup code takes care of + this... */ + + w = 0; + spin_lock_irq(&mdev->req_lock); + D_ASSERT(!list_empty(&mdev->data.work.q)); + w = list_entry(mdev->data.work.q.next,struct drbd_work,list); + list_del_init(&w->list); + spin_unlock_irq(&mdev->req_lock); + + if(!w->cb(mdev,w, mdev->cstate < Connected )) { + //WARN("worker: a callback failed! 
\n"); + if (mdev->cstate >= Connected) + set_cstate(mdev,NetworkFailure); + drbd_thread_restart_nowait(&mdev->receiver); + } + } + + drbd_wait_ee(mdev,&mdev->read_ee); + + i = 0; + spin_lock_irq(&mdev->req_lock); + again: + list_splice_init(&mdev->data.work.q,&work_list); + spin_unlock_irq(&mdev->req_lock); + + while(!list_empty(&work_list)) { + w = list_entry(work_list.next, struct drbd_work,list); + list_del_init(&w->list); + w->cb(mdev,w,1); + i++; + } + + spin_lock_irq(&mdev->req_lock); + ERR_IF(!list_empty(&mdev->data.work.q)) + goto again; + sema_init(&mdev->data.work.s,0); + spin_unlock_irq(&mdev->req_lock); + + INFO("worker terminated\n"); + + return 0; +} --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./drivers/block/drbd/hlist.h 2006-06-21 16:57:01.000000000 +0400 @@ -0,0 +1,129 @@ +#ifndef HLIST_HEAD_INIT +#ifndef HLIST_H +#define HLIST_H + +#ifdef REDHAT_HLIST_BACKPORT +#undef hlist_node +#undef hlist_head +#undef HLIST_HEAD +#undef INIT_HLIST_HEAD +#undef hlist_empty +#undef hlist_del_init +#undef hlist_entry +#undef hlist_add_head +#undef hlist_for_each +#undef hlist_for_each_safe +#endif + +// from linux-2.6.x linux/list.h +// I copied only the part which actually is used in lru_cache.h + +// ok, this is from linux/kernel.h +/** + * container_of - cast a member of a structure out to the containing structure + * + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + * + */ +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +/* + * Double linked lists with a single pointer list head. + * Mostly useful for hash tables where the two pointer list head is + * too wasteful. + * You lose the ability to access the tail in O(1). 
+ */ + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +struct hlist_head { + struct hlist_node *first; +}; + + +#define HLIST_HEAD_INIT { .first = NULL } +#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } +#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) +#define INIT_HLIST_NODE(ptr) ((ptr)->next = NULL, (ptr)->pprev = NULL) + +static __inline__ int hlist_unhashed(const struct hlist_node *h) +{ + return !h->pprev; +} + +static __inline__ int hlist_empty(const struct hlist_head *h) +{ + return !h->first; +} + +static __inline__ void __hlist_del(struct hlist_node *n) +{ + struct hlist_node *next = n->next; + struct hlist_node **pprev = n->pprev; + *pprev = next; + if (next) + next->pprev = pprev; +} + +#ifndef LIST_POISON1 +#define LIST_POISON1 ((void *) 0x00100100) +#define LIST_POISON2 ((void *) 0x00200200) +#endif + +static __inline__ void hlist_del(struct hlist_node *n) +{ + __hlist_del(n); + n->next = LIST_POISON1; + n->pprev = LIST_POISON2; +} + +static __inline__ void hlist_del_init(struct hlist_node *n) +{ + if (n->pprev) { + __hlist_del(n); + INIT_HLIST_NODE(n); + } +} + +static __inline__ void hlist_add_head(struct hlist_node *n, struct hlist_head *h) +{ + struct hlist_node *first = h->first; + n->next = first; + if (first) + first->pprev = &n->next; + h->first = n; + n->pprev = &h->first; +} + +#define hlist_entry(ptr, type, member) container_of(ptr,type,member) + +/* Cannot easily do prefetch unfortunately */ +#define hlist_for_each(pos, head) \ + for (pos = (head)->first; pos && ({ prefetch(pos->next); 1; }); \ + pos = pos->next) + +#define hlist_for_each_safe(pos, n, head) \ + for (pos = (head)->first; n = pos ? pos->next : 0, pos; \ + pos = n) + +/** + * hlist_for_each_entry - iterate over list of given type + * @tpos: the type * to use as a loop counter. + * @pos: the &struct hlist_node to use as a loop counter. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry(tpos, pos, head, member) \ + for (pos = (head)->first; \ + pos && ({ prefetch(pos->next); 1;}) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +#endif +#endif --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./drivers/block/drbd/lru_cache.c 2006-06-21 16:57:01.000000000 +0400 @@ -0,0 +1,289 @@ +/* +-*- linux-c -*- + lru_cache.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 2003-2004, Philipp Reisner . + Copyright (C) 2003-2004, Lars Ellenberg . + authors. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ +#include // for likely() +#include +#include +#include // for memset +#include "lru_cache.h" + +#define STATIC static + +// this is developers aid only! 
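For illustration only (not part of the patch): the hlist primitives backported above are meant to be used as fixed-size hash tables with per-bucket collision chains, which is the pattern lru_cache.c below uses for its slot[] array. A minimal sketch, assuming the hlist.h from this patch; the names my_item, my_table and MY_HASH_SLOTS are invented for the example.

#include "hlist.h"

#define MY_HASH_SLOTS 16

struct my_item {
	unsigned int      number;
	struct hlist_node colision;	/* chain link, like lc_element below */
};

/* static storage is zero-initialized, so every bucket starts out empty */
static struct hlist_head my_table[MY_HASH_SLOTS];

static void my_insert(struct my_item *it)
{
	/* head insertion is O(1); lookups walk one short chain */
	hlist_add_head(&it->colision, &my_table[it->number % MY_HASH_SLOTS]);
}

static struct my_item *my_lookup(unsigned int number)
{
	struct hlist_node *n;
	struct my_item *it;

	hlist_for_each_entry(it, n, &my_table[number % MY_HASH_SLOTS], colision) {
		if (it->number == number)
			return it;
	}
	return NULL;
}

lc_find() and lc_hash_fn() in the code that follows are the same idea, with the bucket count equal to the number of cache elements.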
+#define PARANOIA_ENTRY() BUG_ON(test_and_set_bit(__LC_PARANOIA,&lc->flags))
+#define PARANOIA_LEAVE() do { clear_bit(__LC_PARANOIA,&lc->flags); smp_mb__after_clear_bit(); } while (0)
+#define RETURN(x...) do { PARANOIA_LEAVE(); return x ; } while (0)
+
+/**
+ * lc_alloc: allocates memory for @e_count objects of @e_size bytes plus the
+ * struct lru_cache, and the hash table slots.
+ * returns pointer to a newly initialized lru_cache object with said parameters.
+ */
+struct lru_cache* lc_alloc(unsigned int e_count, size_t e_size,
+			   void *private_p)
+{
+	unsigned long bytes;
+	struct lru_cache *lc;
+	struct lc_element *e;
+	int i;
+
+	BUG_ON(!e_count);
+	e_size = max(sizeof(struct lc_element),e_size);
+	bytes = e_size+sizeof(struct hlist_head);
+	bytes *= e_count;
+	bytes += sizeof(struct lru_cache);
+	lc = vmalloc(bytes);
+	if (lc) {
+		/* vmalloc() may fail; only touch the memory if we got it */
+		memset(lc, 0, bytes);
+		INIT_LIST_HEAD(&lc->in_use);
+		INIT_LIST_HEAD(&lc->lru);
+		INIT_LIST_HEAD(&lc->free);
+		lc->element_size = e_size;
+		lc->nr_elements = e_count;
+		lc->new_number = -1;
+		lc->lc_private = private_p;
+		for(i=0;i<e_count;i++) {
+			e = lc_entry(lc,i);
+			e->lc_number = LC_FREE;
+			list_add(&e->list,&lc->free);
+			// memset(,0,) did the rest of init for us
+		}
+	}
+	return lc;
+}
+
+/**
+ * lc_free: Frees memory allocated by lc_alloc.
+ * @lc: The lru_cache object
+ */
+void lc_free(struct lru_cache* lc)
+{
+	vfree(lc);
+}
+
+static unsigned int lc_hash_fn(struct lru_cache* lc, unsigned int enr)
+{
+	return enr % lc->nr_elements;
+}
+
+
+/**
+ * lc_find: Returns the pointer to an element, if the element is present
+ * in the hash table. In case it is not this function returns NULL.
+ * @lc: The lru_cache object
+ * @enr: element number
+ */
+struct lc_element* lc_find(struct lru_cache* lc, unsigned int enr)
+{
+	struct hlist_node *n;
+	struct lc_element *e;
+
+	BUG_ON(!lc);
+	BUG_ON(!lc->nr_elements);
+	hlist_for_each_entry(e, n, lc->slot + lc_hash_fn(lc, enr), colision) {
+		if (e->lc_number == enr) return e;
+	}
+	return NULL;
+}
+
+STATIC struct lc_element * lc_evict(struct lru_cache* lc)
+{
+	struct list_head  *n;
+	struct lc_element *e;
+
+	if (list_empty(&lc->lru)) return 0;
+
+	n=lc->lru.prev;
+	e=list_entry(n, struct lc_element,list);
+
+	list_del(&e->list);
+	hlist_del(&e->colision);
+	return e;
+}
+
+/**
+ * lc_del: Removes an element from the cache (and therefore adds the
+ * element's storage to the free list)
+ *
+ * @lc: The lru_cache object
+ * @e: The element to remove
+ */
+void lc_del(struct lru_cache* lc, struct lc_element *e)
+{
+	// FIXME what to do with refcnt != 0 ?
+	PARANOIA_ENTRY();
+	BUG_ON(e->refcnt);
+	list_del(&e->list);
+	hlist_del_init(&e->colision);
+	e->lc_number = LC_FREE;
+	e->refcnt = 0;
+	list_add(&e->list,&lc->free);
+	RETURN();
+}
+
+STATIC struct lc_element* lc_get_unused_element(struct lru_cache* lc)
+{
+	struct list_head *n;
+
+	if (list_empty(&lc->free)) return lc_evict(lc);
+
+	n=lc->free.next;
+	list_del(n);
+	return list_entry(n, struct lc_element,list);
+}
+
+STATIC int lc_unused_element_available(struct lru_cache* lc)
+{
+	if (!list_empty(&lc->free)) return 1; // something on the free list
+	if (!list_empty(&lc->lru))  return 1; // something to evict
+
+	return 0;
+}
+
+
+/**
+ * lc_get: Finds an element in the cache, increases its usage count,
+ * "touches" and returns it.
+ * In case the requested number is not present, it needs to be added to the
+ * cache. Therefore it is possible that another element becomes evicted from
+ * the cache. In either case, the user is notified so he is able to e.g.
keep + * a persistent log of the cache changes, and therefore the objects in use. + * + * Return values: + * NULL if the requested element number was not in the cache, and no unused + * element could be recycled + * pointer to the element with the REQUESTED element number + * In this case, it can be used right away + * + * pointer to an UNUSED element with some different element number. + * In this case, the cache is marked dirty, and the returned element + * pointer is removed from the lru list and hash collision chains. + * The user now should do whatever houskeeping is necessary. Then he + * needs to call lc_element_changed(lc,element_pointer), to finish the + * change. + * + * NOTE: The user needs to check the lc_number on EACH use, so he recognizes + * any cache set change. + * + * @lc: The lru_cache object + * @enr: element number + */ +struct lc_element* lc_get(struct lru_cache* lc, unsigned int enr) +{ + struct lc_element *e; + + BUG_ON(!lc); + BUG_ON(!lc->nr_elements); + + PARANOIA_ENTRY(); + if ( lc->flags & LC_STARVING ) RETURN(NULL); + + e = lc_find(lc, enr); + if (e) { + ++e->refcnt; + list_move(&e->list,&lc->in_use); // Not evictable... + RETURN(e); + } + + /* In case there is nothing available and we can not kick out + * the LRU element, we have to wait ... + */ + if(!lc_unused_element_available(lc)) { + __set_bit(__LC_STARVING,&lc->flags); + RETURN(NULL); + } + + /* it was not present in the cache, find an unused element, + * which then is replaced. + * we need to update the cache; serialize on lc->flags & LC_DIRTY + */ + if (test_and_set_bit(__LC_DIRTY,&lc->flags)) RETURN(NULL); + + e = lc_get_unused_element(lc); + BUG_ON(!e); + + clear_bit(__LC_STARVING,&lc->flags); + BUG_ON(++e->refcnt != 1); + + lc->changing_element = e; + lc->new_number = enr; + + RETURN(e); +} + +void lc_changed(struct lru_cache* lc, struct lc_element* e) +{ + PARANOIA_ENTRY(); + BUG_ON(e != lc->changing_element); + e->lc_number = lc->new_number; + list_add(&e->list,&lc->in_use); + hlist_add_head( &e->colision, lc->slot + lc_hash_fn(lc, lc->new_number) ); + lc->changing_element = NULL; + lc->new_number = -1; + clear_bit(__LC_DIRTY,&lc->flags); + smp_mb__after_clear_bit(); + PARANOIA_LEAVE(); +} + + +unsigned int lc_put(struct lru_cache* lc, struct lc_element* e) +{ + BUG_ON(!lc); + BUG_ON(!lc->nr_elements); + BUG_ON(!e); + + PARANOIA_ENTRY(); + BUG_ON(e->refcnt == 0); + if ( --e->refcnt == 0) { + list_move(&e->list,&lc->lru); // move it to the front of LRU. + clear_bit(__LC_STARVING,&lc->flags); + smp_mb__after_clear_bit(); + } + RETURN(e->refcnt); +} + + +/** + * lc_set: Sets an element in the cache. You might use this function to + * setup the cache. It is expected that the elements are properly initialized. + * @lc: The lru_cache object + * @enr: element number + * @index: The elements' position in the cache + */ +void lc_set(struct lru_cache* lc, unsigned int enr, int index) +{ + struct lc_element *e; + + if ( index < 0 || index >= lc->nr_elements ) return; + + e = lc_entry(lc,index); + e->lc_number = enr; + + hlist_del_init(&e->colision); + hlist_add_head( &e->colision, lc->slot + lc_hash_fn(lc,enr) ); + list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru); +} + --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./drivers/block/drbd/lru_cache.h 2006-06-21 16:57:01.000000000 +0400 @@ -0,0 +1,144 @@ +/* +-*- linux-c -*- + lru_cache.c + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. + + Copyright (C) 2003-2004, Philipp Reisner . + main author. 
+ + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +/* + The lru_cache describes a big set of objects that are addressed + by an index number (=lc_number). Only a small fraction of this set + is present in the cache. + (You set the size of the cache using lc_resize) + Once created, the api consists of + lc_find(,nr) -- finds the object with the given number, if present + lc_get(,nr) -- finds the object and increases the usage count + if not present, actions are taken to make sure that + the cache is updated, the user is notified of this by a callback. + Return value is NULL in this case. + As soon as the user informs the cache that it has been updated, + the next lc_get on that very object number will be successfull. + lc_put(,lc_element*) + -- decreases the usage count of this object, and returns the new value. + + NOTE: It is the USERS responsibility to make sure that calls do not happen concurrently. + */ + +#ifndef LRU_CACHE_H +#define LRU_CACHE_H + +#include +#ifndef HLIST_HEAD_INIT +# include "hlist.h" +#endif + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION (2,4,20) +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} +#endif + +#ifndef max +// For RH 2.4.9 +# define max(x,y) \ + ({ typeof(x) __x = (x); typeof(y) __y = (y); \ + (void)(&__x == &__y); \ + __x > __y ? __x: __y; }) +#endif + +#ifndef BUG_ON + /* for ancient 2.4 kernels */ +# define BUG_ON(condition) do { if (unlikely((condition)!=0)) BUG(); } while(0) +#endif + +struct lc_element { + struct hlist_node colision; + struct list_head list; // LRU list or free list + unsigned int refcnt; + unsigned int lc_number; +}; + +struct lru_cache { + struct list_head lru; + struct list_head free; + struct list_head in_use; + size_t element_size; + unsigned int nr_elements; + unsigned int new_number; + unsigned long flags; + struct lc_element *changing_element; // just for paranoia + + void *lc_private; + + struct hlist_head slot[0]; + // hash colision chains here, then element storage. 
+}; + + +// flag-bits for lru_cache +enum { + __LC_PARANOIA, + __LC_DIRTY, + __LC_STARVING, +}; +#define LC_PARANOIA (1<<__LC_PARANOIA) +#define LC_DIRTY (1<<__LC_DIRTY) +#define LC_STARVING (1<<__LC_STARVING) + +extern struct lru_cache* lc_alloc(unsigned int e_count, size_t e_size, + void *private_p); +extern void lc_free(struct lru_cache* lc); +extern void lc_set (struct lru_cache* lc, unsigned int enr, int index); +extern void lc_del (struct lru_cache* lc, struct lc_element *element); + +extern struct lc_element* lc_find(struct lru_cache* lc, unsigned int enr); +extern struct lc_element* lc_get (struct lru_cache* lc, unsigned int enr); +extern unsigned int lc_put (struct lru_cache* lc, struct lc_element* e); +extern void lc_changed(struct lru_cache* lc, struct lc_element* e); + + +/* This can be used to stop lc_get from changing the set of active elements. + * Note that the reference counts and order on the lru list may still change. + * returns true if we aquired the lock. + */ +static inline int lc_try_lock(struct lru_cache* lc) +{ + return !test_and_set_bit(__LC_DIRTY,&lc->flags); +} + +static inline void lc_unlock(struct lru_cache* lc) +{ + clear_bit(__LC_DIRTY,&lc->flags); + smp_mb__after_clear_bit(); +} + +#define LC_FREE (-1) + +#define lc_e_base(lc) ((char*) ( (lc)->slot + (lc)->nr_elements ) ) +#define lc_entry(lc,i) ((struct lc_element*) \ + (lc_e_base(lc) + (i)*(lc)->element_size)) +#define lc_index_of(lc,e) (((char*)(e) - lc_e_base(lc))/(lc)->element_size) + +#endif --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./drivers/block/drbd/mempool.h 2006-06-21 16:57:01.000000000 +0400 @@ -0,0 +1,49 @@ +/* + * memory buffer pool support + */ +#ifndef _LINUX_MEMPOOL_H +#define _LINUX_MEMPOOL_H + +#include +#include + +typedef void * (mempool_alloc_t)(int gfp_mask, void *pool_data); +typedef void (mempool_free_t)(void *element, void *pool_data); + +/* + * A structure for linking multiple client objects into + * a mempool_t + */ +typedef struct mempool_node_s { + struct list_head list; + void *element; +} mempool_node_t; + +/* + * The elements list has full mempool_node_t's at ->next, and empty ones + * at ->prev. Emptiness is signified by mempool_node_t.element == NULL. + * + * curr_nr refers to how many full mempool_node_t's are at ->elements. + * We don't track the total number of mempool_node_t's at ->elements; + * it is always equal to min_nr. + */ +typedef struct mempool_s { + spinlock_t lock; + int min_nr, curr_nr; + struct list_head elements; + + void *pool_data; + mempool_alloc_t *alloc; + mempool_free_t *free; + wait_queue_head_t wait; +} mempool_t; +extern mempool_t * mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data); +extern void mempool_resize(mempool_t *pool, int new_min_nr, int gfp_mask); +extern void mempool_destroy(mempool_t *pool); +extern void * mempool_alloc(mempool_t *pool, int gfp_mask); +extern void mempool_free(void *element, mempool_t *pool); +extern void *mempool_alloc_slab(int gfp_mask, void *pool_data); +extern void mempool_free_slab(void *element, void *pool_data); + +#endif /* _LINUX_MEMPOOL_H */ --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./include/linux/drbd.h 2006-06-21 16:58:59.000000000 +0400 @@ -0,0 +1,247 @@ +/* + drbd.h + Kernel module for 2.4.x/2.6.x Kernels + + This file is part of drbd by Philipp Reisner. 
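For illustration only (not part of the patch): the mempool backport above implements the usual "guaranteed minimum of pre-allocated objects" pattern. Below is a minimal sketch of setting one up from a slab cache, roughly the way the driver presumably builds drbd_request_mempool; the cache name, pool size and struct example_obj are invented for the example.

#include <linux/errno.h>
#include <linux/slab.h>
#include "mempool.h"	/* or <linux/mempool.h> where the kernel provides it */

struct example_obj {
	int dummy;
};

static kmem_cache_t *example_cache;
static mempool_t    *example_pool;

static int example_pool_init(void)
{
	example_cache = kmem_cache_create("example_objs",
					  sizeof(struct example_obj),
					  0, 0, NULL, NULL);
	if (!example_cache)
		return -ENOMEM;

	/* keep at least 32 objects in reserve, so an allocation on the
	 * IO path can wait for an object to be returned instead of failing */
	example_pool = mempool_create(32, mempool_alloc_slab,
				      mempool_free_slab, example_cache);
	if (!example_pool) {
		kmem_cache_destroy(example_cache);
		return -ENOMEM;
	}
	return 0;
}

On the IO path the objects then come from mempool_alloc(example_pool, GFP_NOIO) and go back via mempool_free().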
+ + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ +#ifndef DRBD_H +#define DRBD_H +#include + +#include + +#ifdef __KERNEL__ +#include +#include +#else +#include +#include +#include +#include +#endif + +#ifdef __KERNEL__ +#define IN const +#define OUT +#define INOUT +#else +#define IN +#define OUT const +#define INOUT +#endif + +/* + - Never forget to place bigger members before the smaller ones, + to avoid unaligned placement of members on 64 bit architectures. + - Never forget to add explicit _pad members to make sizeof(struct) + divisible by 8. +*/ + +#define MAX_SOCK_ADDR 128 /* 108 for Unix domain - + 16 for IP, 16 for IPX, + 24 for IPv6, + about 80 for AX.25 + must be at least one bigger than + the AF_UNIX size (see net/unix/af_unix.c + :unix_mkname()). + */ + +enum io_error_handler { + PassOn, + Panic, + Detach +}; + + +struct disk_config { + IN __u64 disk_size; + IN int lower_device; + IN enum io_error_handler on_io_error; + IN int meta_device; + IN int meta_index; +}; + +enum disconnect_handler { + Reconnect, + DropNetConf, + FreezeIO +}; + +struct net_config { + IN char my_addr[MAX_SOCK_ADDR]; + IN char other_addr[MAX_SOCK_ADDR]; + IN int my_addr_len; + IN int other_addr_len; + IN int timeout; // deci seconds + IN int wire_protocol; + IN int try_connect_int; /* seconds */ + IN int ping_int; /* seconds */ + IN int max_epoch_size; + IN int max_buffers; + IN int sndbuf_size; /* socket send buffer size */ + IN unsigned int ko_count; + IN enum disconnect_handler on_disconnect; + const int _pad; +}; + +struct syncer_config { + int rate; /* KB/sec */ + int use_csums; /* use checksum based syncing*/ + int skip; + int group; + int al_extents; + const int _pad; +}; + +/* KEEP the order, do not delete or insert! + * Or change the API_VERSION, too. */ +enum ret_codes { + NoError=0, + LAAlreadyInUse, + OAAlreadyInUse, + LDFDInvalid, + MDFDInvalid, + LDAlreadyInUse, + LDNoBlockDev, + MDNoBlockDev, + LDOpenFailed, + MDOpenFailed, + LDDeviceTooSmall, + MDDeviceTooSmall, + LDNoConfig, + LDMounted, + MDMounted, + LDMDInvalid, + LDDeviceTooLarge, + MDIOError +}; + +struct ioctl_disk_config { + struct disk_config config; + OUT enum ret_codes ret_code; + const int _pad; +}; + +struct ioctl_net_config { + struct net_config config; + OUT enum ret_codes ret_code; + const int _pad; +}; + +struct ioctl_syncer_config { + struct syncer_config config; + OUT enum ret_codes ret_code; + const int _pad; +}; + +struct ioctl_wait { + IN int wfc_timeout; + IN int degr_wfc_timeout; + OUT int ret_code; + int _pad; +}; + +#define DRBD_PROT_A 1 +#define DRBD_PROT_B 2 +#define DRBD_PROT_C 3 + +typedef enum { + Unknown=0, + Primary=1, // role + Secondary=2, // role + Human=4, // flag for set_state + TimeoutExpired=8, // flag for set_state + DontBlameDrbd=16 // flag for set_state +} Drbd_State; + +/* The order of these constants is important. 
+ * The lower ones (<WFReportParams) indicate that there is no socket! + * >=WFReportParams ==> There is a socket + * + * THINK + * Skipped should be < Connected, + * so writes on a Primary after Skipped sync are not mirrored either ? + */ +typedef enum { + Unconfigured, + StandAlone, + Unconnected, + Timeout, + BrokenPipe, + NetworkFailure, + WFConnection, + WFReportParams, // we have a socket + Connected, // we have introduced each other + SkippedSyncS, // we should have synced, but user said no + SkippedSyncT, + WFBitMapS, + WFBitMapT, + SyncSource, // The distance between original state and pause + SyncTarget, // state must be the same for source and target. (+2) + PausedSyncS, // see _drbd_rs_resume() and _drbd_rs_pause() + PausedSyncT, // is sync target, but higher priority groups first +} Drbd_CState; + +#ifndef BDEVNAME_SIZE +# define BDEVNAME_SIZE 32 +#endif + +struct ioctl_get_config { + OUT __u64 disk_size_user; + OUT char lower_device_name[BDEVNAME_SIZE]; + OUT char meta_device_name[BDEVNAME_SIZE]; + struct net_config nconf; + struct syncer_config sconf; + OUT int lower_device_major; + OUT int lower_device_minor; + OUT enum io_error_handler on_io_error; + OUT int meta_device_major; + OUT int meta_device_minor; + OUT int meta_index; + OUT Drbd_CState cstate; + OUT Drbd_State state; + OUT Drbd_State peer_state; + int _pad; +}; + +#define DRBD_MAGIC 0x83740267 +#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC) + +/* 'D' already taken by s390 dasd driver. + * maybe we want to change to something else, and register it officially? + */ +#define DRBD_IOCTL_LETTER 'D' +#define DRBD_IOCTL_GET_VERSION _IOR( DRBD_IOCTL_LETTER, 0x00, int ) +#define DRBD_IOCTL_SET_STATE _IOW( DRBD_IOCTL_LETTER, 0x02, Drbd_State ) +#define DRBD_IOCTL_SET_DISK_CONFIG _IOW( DRBD_IOCTL_LETTER, 0x06, struct ioctl_disk_config ) +#define DRBD_IOCTL_SET_NET_CONFIG _IOW( DRBD_IOCTL_LETTER, 0x07, struct ioctl_net_config ) +#define DRBD_IOCTL_UNCONFIG_NET _IO ( DRBD_IOCTL_LETTER, 0x08 ) +#define DRBD_IOCTL_GET_CONFIG _IOW( DRBD_IOCTL_LETTER, 0x0A, struct ioctl_get_config ) +#define DRBD_IOCTL_INVALIDATE _IO ( DRBD_IOCTL_LETTER, 0x0D ) +#define DRBD_IOCTL_INVALIDATE_REM _IO ( DRBD_IOCTL_LETTER, 0x0E ) +#define DRBD_IOCTL_SET_SYNC_CONFIG _IOW( DRBD_IOCTL_LETTER, 0x0F, struct ioctl_syncer_config ) +#define DRBD_IOCTL_SET_DISK_SIZE _IOW( DRBD_IOCTL_LETTER, 0x10, unsigned int ) +#define DRBD_IOCTL_WAIT_CONNECT _IOR( DRBD_IOCTL_LETTER, 0x11, struct ioctl_wait ) +#define DRBD_IOCTL_WAIT_SYNC _IOR( DRBD_IOCTL_LETTER, 0x12, struct ioctl_wait ) +#define DRBD_IOCTL_UNCONFIG_DISK _IO ( DRBD_IOCTL_LETTER, 0x13 ) +#define DRBD_IOCTL_SET_STATE_FLAGS _IOW( DRBD_IOCTL_LETTER, 0x14, Drbd_State ) + + +#endif + --- /dev/null 2006-06-16 16:00:55.900357250 +0400 +++ ./include/linux/drbd_config.h 2006-06-21 16:58:59.000000000 +0400 @@ -0,0 +1,72 @@ +/* + drbd_config.h + DRBD's compile time configuration. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+*/ + +#ifndef DRBD_CONFIG_H +#define DRBD_CONFIG_H + +extern const char * drbd_buildtag(void); + +#define REL_VERSION "0.7.19" +#define API_VERSION 78 +#define PRO_VERSION 74 + +//#define DBG_ALL_SYMBOLS // no static functions, improves quality of OOPS traces + +//#define DBG_SPINLOCKS // enables MUST_HOLD macro (assertions for spinlocks) +//#define DBG_ASSERTS // drbd_assert_breakpoint() function +//#define DUMP_MD 1 // Dump metadata to syslog upon connect +#define DUMP_MD 2 // Dump even all cstate changes (I like it!) +//#define DUMP_MD 3 // Dump even all meta data access + // (don't! unless we track down a bug...) + +//#define SIGHAND_HACK // Needed for RH 2.4.20 and later kernels. +//#define REDHAT_HLIST_BACKPORT // Makes DRBD work on RH9 kernels + +/* some redhat 2.4.X-Y.Z.whatever kernel flavours have an mm_inline.h, + * which needs to be included explicitly. most 2.4.x kernels don't have that + * header file at all. So uncomment for these, and ignore for all others. + * in 2.6, it will be included anyway. + */ +//#define HAVE_MM_INLINE_H + +//Your 2.4 vendor kernel already defines find_next_bit() +//#define HAVE_FIND_NEXT_BIT + +//Your 2.4 kernel does not define find_next_bit(), +//and you are too lazy to "backport" it from 2.6 for your arch: +//#define USE_GENERIC_FIND_NEXT_BIT + +//#define PARANOIA // some extra checks + +// don't enable this, unless you can cope with gigabyte syslogs :) +//#define DUMP_EACH_PACKET + +// Dump every hour the usage / non-usage of zero copy IO +//#define SHOW_SENDPAGE_USAGE + +// You can disable the use of the sendpage() call (= zero copy +// IO) if you have the feeling that this might be the cause +// of trouble. +// #define DRBD_DISABLE_SENDPAGE + +// older, unpatched kernel sources do not have it, so we added it in drbd_receiver.c +// but more recent kernels define it in arch/um/include/mem.h +#define HAVE_UML_TO_VIRT + +#endif
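
A usage note for reviewers: the fragment below is a minimal sketch (not part of the patch) of the calling pattern the lru_cache interface above is designed for -- reference-counted lookup with lc_get()/lc_put(), and lc_changed() to commit a label change. It is loosely modeled on how drbd_actlog.c drives the activity log; the header name, the slot count, the lc_number field access and the "persist the change" step are illustrative assumptions, not code from the patch.

/* Illustrative sketch only -- not part of the patch. */
#include "lru_cache.h"	/* header name assumed; declarations are the ones quoted above */

static struct lru_cache *al;	/* e.g. an activity log, set up at attach time */

static void example_init(void)
{
	/* a small cache of plain lc_element slots, no private data */
	al = lc_alloc(61, sizeof(struct lc_element), NULL);
}

static int example_touch_extent(unsigned int enr)
{
	struct lc_element *e;

	e = lc_get(al, enr);	/* takes a reference; may recycle an idle slot */
	if (!e)
		return 0;	/* active set locked (LC_DIRTY) or starving: caller retries */

	if (e->lc_number != enr) {
		/* lc_get picked a slot that still carries an old label;
		 * persist that change (e.g. write an on-disk transaction),
		 * then let the cache commit it. */
		lc_changed(al, e);
	}

	/* ... extent 'enr' stays pinned while the reference is held ... */

	lc_put(al, e);		/* drop the reference again */
	return 1;
}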
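
Similarly, the ioctls declared in include/linux/drbd.h above form the user-space management interface (the drbd user-space tools drive the module through them). The sketch below shows how a management tool might query the API version and wait for the connection to come up; the device node /dev/drbd0, the zero timeout values and the minimal error handling are illustrative assumptions, not taken from the patch.

/* Illustrative user-space sketch only -- not part of the patch. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/drbd.h>

int main(void)
{
	struct ioctl_wait w = { .wfc_timeout = 0, .degr_wfc_timeout = 0 };
	int fd, api = 0;

	fd = open("/dev/drbd0", O_RDONLY);	/* assumed device node */
	if (fd < 0) {
		perror("open /dev/drbd0");
		return 1;
	}

	/* ask the module which API revision it speaks */
	if (ioctl(fd, DRBD_IOCTL_GET_VERSION, &api) == 0)
		printf("DRBD API version: %d\n", api);

	/* block until the peer is connected; timeout semantics are
	 * defined by the driver, 0 is used here as "driver default" */
	if (ioctl(fd, DRBD_IOCTL_WAIT_CONNECT, &w) == 0)
		printf("wait_connect ret_code: %d\n", w.ret_code);

	close(fd);
	return 0;
}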