This diff requires a kernel source already patched with raid0145-19990824-2.2.11 patch (e.g. 2.2.11 or 2.2.12 or 2.2.13 - you can safely ignore the rejects when applying the raid patch). Changes: - new, better readable and parseable /proc/mdstat - fixed some typos in comments TODO: - how do I find out md device count (md0, md1, ..., md) ? Look out for new xosview that reads new /proc/mdstat and displays your RAID state. Thomas diff -c -r linux-2.2.13-RAID/drivers/block/md.c linux-2.2.13-RAID-tw1/drivers/block/md.c *** linux-2.2.13-RAID/drivers/block/md.c Mon Nov 8 01:28:27 1999 --- linux-2.2.13-RAID-tw1/drivers/block/md.c Mon Nov 8 01:57:48 1999 *************** *** 11,17 **** - kerneld support by Boris Tobotras - kmod support by: Cyrus Durgin - RAID0 bugfixes: Mark Anthony Lisher ! This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) --- 11,19 ---- - kerneld support by Boris Tobotras - kmod support by: Cyrus Durgin - RAID0 bugfixes: Mark Anthony Lisher ! - easier parseable /proc/mdstat by ! look out for xosview displaying your RAID status soon! ! This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) *************** *** 85,91 **** * the RAID driver will use the maximum available bandwith if the IO * subsystem is idle. * ! * you can change it via /proc/sys/dev/speed-limit */ static int sysctl_speed_limit = 100; --- 87,93 ---- * the RAID driver will use the maximum available bandwith if the IO * subsystem is idle. * ! * you can change it via /proc/sys/dev/md/speed-limit */ static int sysctl_speed_limit = 100; *************** *** 119,125 **** } /* ! * The mapping between kdev and mddev is not necessary a simple * one! Eg. HSM uses several sub-devices to implement Logical * Volumes. All these sub-devices map to the same mddev. */ --- 121,127 ---- } /* ! * The mapping between kdev and mddev is not necessarily a simple * one! Eg. HSM uses several sub-devices to implement Logical * Volumes. All these sub-devices map to the same mddev. */ *************** *** 669,675 **** MD_BUG(); rdev->inode = get_empty_inode(); /* ! * we dont care about any other fields */ rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev; insert_inode_hash(rdev->inode); --- 671,677 ---- MD_BUG(); rdev->inode = get_empty_inode(); /* ! * we don't care about any other fields */ rdev->inode->i_dev = rdev->inode->i_rdev = rdev->dev; insert_inode_hash(rdev->inode); *************** *** 916,922 **** } /* * If the disk went offline meanwhile and it's just a spare, then ! * it's size has changed to zero silently, and the MD code does * not yet know that it's faulty. */ size = calc_dev_size(dev, rdev->mddev, 1); --- 918,924 ---- } /* * If the disk went offline meanwhile and it's just a spare, then ! * its size has changed to zero silently, and the MD code does * not yet know that it's faulty. */ size = calc_dev_size(dev, rdev->mddev, 1); *************** *** 1217,1223 **** __u64 ev1, ev2; /* * if the checksum is invalid, use the superblock ! * only as a last resort. (decrease it's age by * one event) */ if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) { --- 1219,1225 ---- __u64 ev1, ev2; /* * if the checksum is invalid, use the superblock ! * only as a last resort. (decrease its age by * one event) */ if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) { *************** *** 1369,1375 **** } /* ! * Double check wether all devices mentioned in the * superblock are in the rdev ring. */ for (i = 0; i < MD_SB_DISKS; i++) { --- 1371,1377 ---- } /* ! * Double check whether all devices mentioned in the * superblock are in the rdev ring. */ for (i = 0; i < MD_SB_DISKS; i++) { *************** *** 1602,1608 **** /* * 'default chunksize' in the old md code used to * be PAGE_SIZE, baaad. ! * we abort here to be on the safe side. We dont * want to continue the bad practice. */ printk(BAD_CHUNKSIZE); --- 1604,1610 ---- /* * 'default chunksize' in the old md code used to * be PAGE_SIZE, baaad. ! * we abort here to be on the safe side. We don't * want to continue the bad practice. */ printk(BAD_CHUNKSIZE); *************** *** 1839,1845 **** } /* ! * lets try to run arrays based on all disks that have arrived * until now. (those are in the ->pending list) * * the method: pick the first pending disk, collect all disks with --- 1841,1847 ---- } /* ! * let's try to run arrays based on all disks that have arrived * until now. (those are in the ->pending list) * * the method: pick the first pending disk, collect all disks with *************** *** 2959,2965 **** thread->tsk = current; /* ! * md_thread is a 'system-thread', it's priority should be very * high. We avoid resource deadlocks individually in each * raid personality. (RAID5 does preallocation) We also use RR and * the very same RT priority as kswapd, thus we will never get --- 2961,2967 ---- thread->tsk = current; /* ! * md_thread is a 'system-thread', its priority should be very * high. We avoid resource deadlocks individually in each * raid personality. (RAID5 does preallocation) We also use RR and * the very same RT priority as kswapd, thus we will never get *************** *** 3103,3140 **** static int status_unused (char * page) { ! int sz = 0, i = 0; mdk_rdev_t *rdev; struct md_list_head *tmp; ! sz += sprintf(page + sz, "unused devices: "); ITERATE_RDEV_ALL(rdev,tmp) { if (!rdev->same_set.next && !rdev->same_set.prev) { /* * The device is not yet used by any array. */ ! i++; ! sz += sprintf(page + sz, "%s ", ! partition_name(rdev->dev)); } } ! if (!i) ! sz += sprintf(page + sz, ""); ! ! sz += sprintf(page + sz, "\n"); return sz; } static int status_resync (char * page, mddev_t * mddev) { ! int sz = 0; unsigned int blocksize, max_blocks, resync, res, dt, tt, et; resync = mddev->curr_resync; ! blocksize = blksize_size[MD_MAJOR][mdidx(mddev)]; ! max_blocks = blk_size[MD_MAJOR][mdidx(mddev)] / (blocksize >> 10); /* * Should not happen. --- 3105,3153 ---- static int status_unused (char * page) { ! int sz = 0; mdk_rdev_t *rdev; struct md_list_head *tmp; ! sz += sprintf(page+sz, "%-30s", "md_unused_devices"); ITERATE_RDEV_ALL(rdev,tmp) { if (!rdev->same_set.next && !rdev->same_set.prev) { /* * The device is not yet used by any array. */ ! sz += sprintf(page+sz, "%s ", partition_name(rdev->dev)); } } ! sz += sprintf(page+sz, "\n"); return sz; } static int status_resync (char * page, mddev_t * mddev) { ! int sz=0, mdnum; unsigned int blocksize, max_blocks, resync, res, dt, tt, et; + char str[256]; + + mdnum=mdidx(mddev); + + sprintf(str, "md_resync_status.%d", mdnum); + sz += sprintf(page+sz, "%-30s", str); resync = mddev->curr_resync; ! ! if (!resync){ ! if (md_atomic_read(&mddev->resync_sem.count) != 1) ! sz += sprintf(page+sz, "RESYNC_DELAYED"); ! else ! sz += sprintf(page+sz, "IDLE"); ! sz += sprintf(page+sz, "\n"); ! return sz; ! } ! ! blocksize = blksize_size[MD_MAJOR][mdnum]; ! max_blocks = blk_size[MD_MAJOR][mdnum] / (blocksize >> 10); /* * Should not happen. *************** *** 3148,3159 **** /* * true resync */ ! sz += sprintf(page + sz, " resync=%u%%", res); else /* * recovery ... */ ! sz += sprintf(page + sz, " recovery=%u%%", res); /* * We do not want to overflow, so the order of operands and --- 3161,3172 ---- /* * true resync */ ! sz += sprintf(page+sz, "resync=%u%%", res); else /* * recovery ... */ ! sz += sprintf(page+sz, "recovery=%u%%", res); /* * We do not want to overflow, so the order of operands and *************** *** 3174,3250 **** */ et = 0; ! sz += sprintf(page + sz, " finish=%u.%umin", et / 60, (et % 60)/6); return sz; } int get_md_status (char *page) { ! int sz = 0, j, size; ! struct md_list_head *tmp, *tmp2; mdk_rdev_t *rdev; mddev_t *mddev; ! sz += sprintf(page + sz, "Personalities : "); for (j = 0; j < MAX_PERSONALITY; j++) ! if (pers[j]) ! sz += sprintf(page+sz, "[%s] ", pers[j]->name); ! sz += sprintf(page+sz, "\n"); ! sz += sprintf(page+sz, "read_ahead "); if (read_ahead[MD_MAJOR] == INT_MAX) ! sz += sprintf(page+sz, "not set\n"); else ! sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]); ! ITERATE_MDDEV(mddev,tmp) { ! sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev), ! mddev->pers ? "" : "in"); if (mddev->pers) { ! if (mddev->ro) ! sz += sprintf(page + sz, " (read-only)"); ! sz += sprintf(page + sz, " %s", mddev->pers->name); } size = 0; ! ITERATE_RDEV(mddev,rdev,tmp2) { ! sz += sprintf(page + sz, " %s[%d]", ! partition_name(rdev->dev), rdev->desc_nr); if (rdev->faulty) { ! sz += sprintf(page + sz, "(F)"); ! continue; } - size += rdev->size; } if (mddev->nb_dev) { ! if (mddev->pers) ! sz += sprintf(page + sz, " %d blocks", ! md_size[mdidx(mddev)]); ! else ! sz += sprintf(page + sz, " %d blocks", size); ! } ! ! if (!mddev->pers) { ! sz += sprintf(page+sz, "\n"); ! continue; } ! sz += mddev->pers->status (page+sz, mddev); ! ! if (mddev->curr_resync) ! sz += status_resync (page+sz, mddev); ! else { ! if (md_atomic_read(&mddev->resync_sem.count) != 1) ! sz += sprintf(page + sz, " resync=DELAYED"); } - sz += sprintf(page + sz, "\n"); } - sz += status_unused (page + sz); - return (sz); } --- 3187,3273 ---- */ et = 0; ! sz += sprintf(page+sz, " finish=%u.%umin\n", et / 60, (et % 60)/6); return sz; } int get_md_status (char *page) { ! int sz=0, j, size, mdnum; ! struct md_list_head *tmp; mdk_rdev_t *rdev; mddev_t *mddev; + char str[256]; ! sz += sprintf(page+sz, "%-30s%d\n", "md_stat_version", 100); ! sz += sprintf(page+sz, "%-30s", "md_known_personalities"); for (j = 0; j < MAX_PERSONALITY; j++) ! if (pers[j]) ! sz += sprintf(page+sz, "%s ", pers[j]->name); sz += sprintf(page+sz, "\n"); ! sz += sprintf(page+sz, "%-30s", "md_read_ahead_sectors"); if (read_ahead[MD_MAJOR] == INT_MAX) ! sz += sprintf(page+sz, "\n"); // tjw: does this mean == 0 ?? else ! sz += sprintf(page+sz, "%d\n", read_ahead[MD_MAJOR]); ! ! // sz += sprintf(page+sz, "%-30s%s\n", "md_device_list", "0 1 2... (todo)"); ! // but HOW ??? ! ! sz += status_unused (page+sz); ! ITERATE_MDDEV(mddev,tmp) { ! mdnum=mdidx(mddev); ! sprintf(str,"md_state.%d",mdnum); ! sz += sprintf(page+sz, "%-30s%sactive", str, ! mddev->pers ? "" : "in"); ! if (mddev->pers){ ! sz += sprintf(page+sz, " (%s)", mddev->ro ? "ro":"rw"); ! } ! sz += sprintf(page+sz,"\n"); ! if (mddev->pers) { ! sprintf(str,"md_type.%d",mdnum); ! sz += sprintf(page+sz, "%-30s%s\n", str, mddev->pers->name); ! } ! ! if (mddev->nb_dev) { ! sprintf(str, "md_disk_count.%d", mdnum); ! sz += sprintf(page+sz, "%-30s%d\n", str, mddev->nb_dev); ! sprintf(str, "md_disk_list.%d", mdnum); ! sz += sprintf(page+sz, "%-30s", str); ! for (j = 0; j < mddev->nb_dev; j++) ! sz += sprintf(page+sz, "%d ", j); ! sz += sprintf(page+sz, "\n"); } size = 0; ! ITERATE_RDEV_ORDERED(mddev,rdev,j) { ! sprintf(str,"md_disk_state.%d.%d",mdnum,rdev->desc_nr); ! sz += sprintf(page+sz, "%-30s%s", str, ! partition_name(rdev->dev)); if (rdev->faulty) { ! sz += sprintf(page+sz, " (faulty)\n"); ! }else{ ! sz += sprintf(page+sz," (OK)\n"); ! size += rdev->size; } } if (mddev->nb_dev) { ! sprintf(str, "md_size_in_blocks.%d", mdnum); ! sz += sprintf(page+sz, "%-30s%d\n", str, ! mddev->pers ? md_size[mdnum] : size); } ! if (mddev->pers) { ! sz += mddev->pers->status (page+sz, mddev); ! sz += status_resync(page+sz, mddev); } } return (sz); } *************** *** 3547,3553 **** /* * This is the normal 'everything went OK' case ! * do a 'free-behind' logic, we sure dont need * this buffer if it was the only user. */ for (i = 0; i < chunk; i++) --- 3570,3576 ---- /* * This is the normal 'everything went OK' case ! * do a 'free-behind' logic, we sure don't need * this buffer if it was the only user. */ for (i = 0; i < chunk; i++) *************** *** 3660,3666 **** * the amount of foolproofing might seem to be a tad excessive, but an * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs * of my root partition with the first 0.5 gigs of my /home partition ... so ! * i'm a bit nervous ;) */ void md_do_recovery (void *data) { --- 3683,3689 ---- * the amount of foolproofing might seem to be a tad excessive, but an * early (not so error-safe) version of raid1syncd synced the first 0.5 gigs * of my root partition with the first 0.5 gigs of my /home partition ... so ! * I'm a bit nervous ;) */ void md_do_recovery (void *data) { *************** *** 3732,3738 **** if (!disk_faulty(spare)) { /* * the SPARE_ACTIVE diskop possibly changes the ! * pointer too */ mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); mark_disk_sync(spare); --- 3755,3761 ---- if (!disk_faulty(spare)) { /* * the SPARE_ACTIVE diskop possibly changes the ! * pointer, too */ mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); mark_disk_sync(spare); diff -c -r linux-2.2.13-RAID/drivers/block/raid0.c linux-2.2.13-RAID-tw1/drivers/block/raid0.c *** linux-2.2.13-RAID/drivers/block/raid0.c Mon Nov 8 01:28:27 1999 --- linux-2.2.13-RAID-tw1/drivers/block/raid0.c Thu Oct 28 04:36:29 1999 *************** *** 280,294 **** static int raid0_status (char *page, mddev_t *mddev) { ! int sz = 0; #undef MD_DEBUG #ifdef MD_DEBUG int j, k; raid0_conf_t *conf = mddev_to_conf(mddev); ! ! sz += sprintf(page + sz, " "); for (j = 0; j < conf->nr_zones; j++) { ! sz += sprintf(page + sz, "[z%d", conf->hash_table[j].zone0 - conf->strip_zone); if (conf->hash_table[j].zone1) sz += sprintf(page+sz, "/z%d] ", --- 280,301 ---- static int raid0_status (char *page, mddev_t *mddev) { ! int sz = 0, mdnum; ! char str[256]; ! #undef MD_DEBUG #ifdef MD_DEBUG int j, k; raid0_conf_t *conf = mddev_to_conf(mddev); ! #endif ! ! mdnum=mdidx(mddev); ! ! #ifdef MD_DEBUG ! sprintf(str, "md_debug_zones.%d", mdnum); ! sz += sprintf(page + sz, "%-30s", str); for (j = 0; j < conf->nr_zones; j++) { ! sz += sprintf(page+sz, "[z%d", conf->hash_table[j].zone0 - conf->strip_zone); if (conf->hash_table[j].zone1) sz += sprintf(page+sz, "/z%d] ", *************** *** 296,306 **** else sz += sprintf(page+sz, "] "); } ! ! sz += sprintf(page + sz, "\n"); ! for (j = 0; j < conf->nr_strip_zones; j++) { ! sz += sprintf(page + sz, " z%d=[", j); for (k = 0; k < conf->strip_zone[j].nb_dev; k++) sz += sprintf (page+sz, "%s/", partition_name( conf->strip_zone[j].dev[k]->dev)); --- 303,313 ---- else sz += sprintf(page+sz, "] "); } ! sz += sprintf(page+sz, "\n"); ! for (j = 0; j < conf->nr_strip_zones; j++) { ! sprintf(str, "md_debug_strip_zones.%d.z%d", mdnum, j); ! sz += sprintf(page+sz, "-%30s[", str); for (k = 0; k < conf->strip_zone[j].nb_dev; k++) sz += sprintf (page+sz, "%s/", partition_name( conf->strip_zone[j].dev[k]->dev)); *************** *** 311,317 **** conf->strip_zone[j].size); } #endif ! sz += sprintf(page + sz, " %dk chunks", mddev->param.chunk_size/1024); return sz; } --- 318,325 ---- conf->strip_zone[j].size); } #endif ! sprintf(str, "md_chunksize_kb.%d", mdnum); ! sz += sprintf(page+sz, "%-30s%d\n", str, mddev->param.chunk_size/1024); return sz; } diff -c -r linux-2.2.13-RAID/drivers/block/raid1.c linux-2.2.13-RAID-tw1/drivers/block/raid1.c *** linux-2.2.13-RAID/drivers/block/raid1.c Mon Nov 8 01:28:27 1999 --- linux-2.2.13-RAID-tw1/drivers/block/raid1.c Mon Nov 8 01:50:06 1999 *************** *** 169,175 **** } /* ! * This routine checks if the undelying device is an md device * and in that case it maps the blocks before putting the * request on the queue */ --- 169,175 ---- } /* ! * This routine checks if the underlying device is an md device * and in that case it maps the blocks before putting the * request on the queue */ *************** *** 296,302 **** * We should use a private pool (size depending on NR_REQUEST), * to avoid writes filling up the memory with bhs * ! * Such pools are much faster than kmalloc anyways (so we waste * almost nothing by not using the master bh when writing and * win alot of cleanness) but for now we are cool enough. --mingo * --- 296,302 ---- * We should use a private pool (size depending on NR_REQUEST), * to avoid writes filling up the memory with bhs * ! * Such pools are much faster than kmalloc anyway (so we waste * almost nothing by not using the master bh when writing and * win alot of cleanness) but for now we are cool enough. --mingo * *************** *** 330,336 **** md_atomic_set(&r1_bh->remaining, sum_bhs); /* ! * We have to be a bit careful about the semaphore above, thats * why we start the requests separately. Since kmalloc() could * fail, sleep and make_request() can sleep too, this is the * safer solution. Imagine, end_request decreasing the semaphore --- 330,336 ---- md_atomic_set(&r1_bh->remaining, sum_bhs); /* ! * We have to be a bit careful about the semaphore above, that's * why we start the requests separately. Since kmalloc() could * fail, sleep and make_request() can sleep too, this is the * safer solution. Imagine, end_request decreasing the semaphore *************** *** 350,363 **** static int raid1_status (char *page, mddev_t *mddev) { raid1_conf_t *conf = mddev_to_conf(mddev); ! int sz = 0, i; ! ! sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, ! conf->working_disks); for (i = 0; i < conf->raid_disks; i++) ! sz += sprintf (page+sz, "%s", ! conf->mirrors[i].operational ? "U" : "_"); ! sz += sprintf (page+sz, "]"); return sz; } --- 350,377 ---- static int raid1_status (char *page, mddev_t *mddev) { raid1_conf_t *conf = mddev_to_conf(mddev); ! mdp_super_t *sb = mddev->sb; ! int sz = 0, i, mdnum; ! char str[256]; ! ! mdnum=mdidx(mddev); ! sprintf(str, "md_level.%d",mdnum); ! sz += sprintf (page+sz, "%-30s%d\n", str, sb->level); ! sprintf(str, "md_algorithm.%d",mdnum); ! sz += sprintf (page+sz, "%-30s%d\n", str, sb->layout); ! sprintf(str, "md_chunksize_kb.%d",mdnum); ! sz += sprintf (page+sz, "%-30s%d\n", str, sb->chunk_size >> 10); ! sprintf(str, "md_raid_disks.%d",mdnum); ! sz += sprintf (page+sz, "%-30s%d\n", str, conf->raid_disks); ! sprintf(str, "md_working_disks.%d",mdnum); ! sz += sprintf (page+sz, "%-30s%d\n", str, conf->working_disks); ! ! sprintf(str, "md_working_disk_map.%d",mdnum); ! sz += sprintf (page+sz, "%-30s", str); for (i = 0; i < conf->raid_disks; i++) ! sz += sprintf (page+sz, "%s", conf->mirrors[i].operational ? "+" : "-"); ! sz += sprintf (page+sz, "\n"); ! return sz; } *************** *** 608,614 **** break; /* * Activate (mark read-write) the (now sync) spare disk, ! * which means we switch it's 'raid position' (->raid_disk) * with the failed disk. (only the first 'conf->nr_disks' * slots are used for 'real' disks and we must preserve this * property) --- 622,628 ---- break; /* * Activate (mark read-write) the (now sync) spare disk, ! * which means we switch its 'raid position' (->raid_disk) * with the failed disk. (only the first 'conf->nr_disks' * slots are used for 'real' disks and we must preserve this * property) diff -c -r linux-2.2.13-RAID/drivers/block/raid5.c linux-2.2.13-RAID-tw1/drivers/block/raid5.c *** linux-2.2.13-RAID/drivers/block/raid5.c Mon Nov 8 01:28:27 1999 --- linux-2.2.13-RAID-tw1/drivers/block/raid5.c Mon Nov 8 01:52:56 1999 *************** *** 1762,1774 **** { raid5_conf_t *conf = (raid5_conf_t *) mddev->private; mdp_super_t *sb = mddev->sb; ! int sz = 0, i; ! sz += sprintf (page+sz, " level %d, %dk chunk, algorithm %d", sb->level, sb->chunk_size >> 10, sb->layout); ! sz += sprintf (page+sz, " [%d/%d] [", conf->raid_disks, conf->working_disks); for (i = 0; i < conf->raid_disks; i++) ! sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "U" : "_"); ! sz += sprintf (page+sz, "]"); return sz; } --- 1762,1787 ---- { raid5_conf_t *conf = (raid5_conf_t *) mddev->private; mdp_super_t *sb = mddev->sb; ! int sz = 0, i, mdnum; ! char str[256]; ! ! mdnum=mdidx(mddev); ! sprintf(str, "md_level.%d",mdnum); ! sz += sprintf (page+sz, "%-30s%d\n", str, sb->level); ! sprintf(str, "md_algorithm.%d",mdnum); ! sz += sprintf (page+sz, "%-30s%d\n", str, sb->layout); ! sprintf(str, "md_chunksize_kb.%d",mdnum); ! sz += sprintf (page+sz, "%-30s%d\n", str, sb->chunk_size >> 10); ! sprintf(str, "md_raid_disks.%d",mdnum); ! sz += sprintf (page+sz, "%-30s%d\n", str, conf->raid_disks); ! sprintf(str, "md_working_disks.%d",mdnum); ! sz += sprintf (page+sz, "%-30s%d\n", str, conf->working_disks); ! sprintf(str, "md_working_disk_map.%d",mdnum); ! sz += sprintf (page+sz, "%-30s", str); for (i = 0; i < conf->raid_disks; i++) ! sz += sprintf (page+sz, "%s", conf->disks[i].operational ? "+" : "-"); ! sz += sprintf (page+sz, "\n"); return sz; }