diff --git a/CREDITS b/CREDITS
index a347520..1b4f869 100644
--- a/CREDITS
+++ b/CREDITS
@@ -611,8 +611,7 @@ S: USA
N: Randolph Chung
E: tausq@debian.org
D: Linux/PA-RISC hacker
-S: Los Altos, CA 94022
-S: USA
+S: Hong Kong
N: Juan Jose Ciarlante
W: http://juanjox.kernelnotes.org/
@@ -1097,7 +1096,7 @@ S: 80050-430 - Curitiba - Paraná
S: Brazil
N: Kumar Gala
-E: kumar.gala@freescale.com
+E: galak@kernel.crashing.org
D: Embedded PowerPC 6xx/7xx/74xx/82xx/83xx/85xx support
S: Austin, Texas 78729
S: USA
@@ -2247,6 +2246,12 @@ S: 249 Nichols Avenue
S: Syracuse, New York 13206
S: USA
+N: Kyle McMartin
+E: kyle@parisc-linux.org
+D: Linux/PARISC hacker
+D: AD1889 sound driver
+S: Ottawa, Canada
+
N: Dirk Melchers
E: dirk@merlin.nbg.sub.org
D: 8 bit XT hard disk driver for OMTI5520
@@ -3399,6 +3404,15 @@ S: Chudenicka 8
S: 10200 Prague 10, Hostivar
S: Czech Republic
+N: Thibaut Varene
+E: T-Bone@parisc-linux.org
+W: http://www.parisc-linux.org/
+P: 1024D/B7D2F063 E67C 0D43 A75E 12A5 BB1C FA2F 1E32 C3DA B7D2 F063
+D: PA-RISC port minion, PDC and GSCPS2 drivers, debuglocks and other bits
+D: Some bits in an ARM port, S1D13XXX FB driver, random patches here and there
+D: AD1889 sound driver
+S: Paris, France
+
N: Heikki Vatiainen
E: hessu@cs.tut.fi
D: Co-author of Multi-Protocol Over ATM (MPOA), some LANE hacks
@@ -3636,11 +3650,9 @@ S: Beaverton, OR 97005
S: USA
N: Michal Wronski
-E: wrona@mat.uni.torun.pl
-W: http://www.mat.uni.torun.pl/~wrona
+E: Michal.Wronski@motorola.com
D: POSIX message queues fs (with K. Benedyczak)
-S: ul. Teczowa 23/12
-S: 80-680 Gdansk-Sobieszewo
+S: Krakow
S: Poland
N: Frank Xia
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 433cf5e..5f7f7d7 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -24,6 +24,8 @@ DMA-mapping.txt
- info for PCI drivers using DMA portably across all platforms.
DocBook/
- directory with DocBook templates etc. for kernel documentation.
+HOWTO
+ - The process and procedures of how to do Linux kernel development.
IO-mapping.txt
- how to access I/O mapped memory from within device drivers.
IPMI.txt
@@ -256,6 +258,10 @@ specialix.txt
- info on hardware/driver for specialix IO8+ multiport serial card.
spinlocks.txt
- info on using spinlocks to provide exclusive access in kernel.
+stable_api_nonsense.txt
+ - info on why the kernel does not have a stable in-kernel api or abi.
+stable_kernel_rules.txt
+ - rules and procedures for the -stable kernel releases.
stallion.txt
- info on using the Stallion multiport serial driver.
svga.txt
diff --git a/Documentation/Changes b/Documentation/Changes
index 27232be..86b8639 100644
--- a/Documentation/Changes
+++ b/Documentation/Changes
@@ -65,7 +65,7 @@ o isdn4k-utils 3.1pre1
o nfs-utils 1.0.5 # showmount --version
o procps 3.2.0 # ps --version
o oprofile 0.9 # oprofiled --version
-o udev 058 # udevinfo -V
+o udev 071 # udevinfo -V
Kernel compilation
==================
@@ -139,9 +139,14 @@ You'll probably want to upgrade.
Ksymoops
--------
-If the unthinkable happens and your kernel oopses, you'll need a 2.4
-version of ksymoops to decode the report; see REPORTING-BUGS in the
-root of the Linux source for more information.
+If the unthinkable happens and your kernel oopses, you may need the
+ksymoops tool to decode it, but in most cases you don't.
+In the 2.6 kernel it is generally preferred to build the kernel with
+CONFIG_KALLSYMS so that it produces readable dumps that can be used as-is
+(this also produces better output than ksymoops).
+If for some reason your kernel is not build with CONFIG_KALLSYMS and
+you have no way to rebuild and reproduce the Oops with that option, then
+you can still decode that Oops with ksymoops.
Module-Init-Tools
-----------------
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
index fa3e29a..1c95588 100644
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -10,7 +10,7 @@ DOCBOOKS := wanbook.xml z8530book.xml mc
kernel-hacking.xml kernel-locking.xml deviceiobook.xml \
procfs-guide.xml writing_usb_driver.xml \
sis900.xml kernel-api.xml journal-api.xml lsm.xml usb.xml \
- gadget.xml libata.xml mtdnand.xml librs.xml
+ gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml
###
# The build process is as follows (targets):
@@ -20,6 +20,12 @@ DOCBOOKS := wanbook.xml z8530book.xml mc
# +--> DIR=file (htmldocs)
# +--> man/ (mandocs)
+
+# for PDF and PS output you can choose between xmlto and docbook-utils tools
+PDF_METHOD = $(prefer-db2x)
+PS_METHOD = $(prefer-db2x)
+
+
###
# The targets that may be used.
.PHONY: xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs
@@ -93,27 +99,39 @@ C-procfs-example = procfs_example.xml
C-procfs-example2 = $(addprefix $(obj)/,$(C-procfs-example))
$(obj)/procfs-guide.xml: $(C-procfs-example2)
-###
-# Rules to generate postscript, PDF and HTML
-# db2html creates a directory. Generate a html file used for timestamp
+notfoundtemplate = echo "*** You have to install docbook-utils or xmlto ***"; \
+ exit 1
+db2xtemplate = db2TYPE -o $(dir $@) $<
+xmltotemplate = xmlto TYPE $(XMLTOFLAGS) -o $(dir $@) $<
+
+# determine which methods are available
+ifeq ($(shell which db2ps >/dev/null 2>&1 && echo found),found)
+ use-db2x = db2x
+ prefer-db2x = db2x
+else
+ use-db2x = notfound
+ prefer-db2x = $(use-xmlto)
+endif
+ifeq ($(shell which xmlto >/dev/null 2>&1 && echo found),found)
+ use-xmlto = xmlto
+ prefer-xmlto = xmlto
+else
+ use-xmlto = notfound
+ prefer-xmlto = $(use-db2x)
+endif
-quiet_cmd_db2ps = XMLTO $@
- cmd_db2ps = xmlto ps $(XMLTOFLAGS) -o $(dir $@) $<
+# the commands, generated from the chosen template
+quiet_cmd_db2ps = PS $@
+ cmd_db2ps = $(subst TYPE,ps, $($(PS_METHOD)template))
%.ps : %.xml
- @(which xmlto > /dev/null 2>&1) || \
- (echo "*** You need to install xmlto ***"; \
- exit 1)
$(call cmd,db2ps)
-quiet_cmd_db2pdf = XMLTO $@
- cmd_db2pdf = xmlto pdf $(XMLTOFLAGS) -o $(dir $@) $<
+quiet_cmd_db2pdf = PDF $@
+ cmd_db2pdf = $(subst TYPE,pdf, $($(PDF_METHOD)template))
%.pdf : %.xml
- @(which xmlto > /dev/null 2>&1) || \
- (echo "*** You need to install xmlto ***"; \
- exit 1)
$(call cmd,db2pdf)
-quiet_cmd_db2html = XMLTO $@
+quiet_cmd_db2html = HTML $@
cmd_db2html = xmlto xhtml $(XMLTOFLAGS) -o $(patsubst %.html,%,$@) $< && \
echo '<a HREF="$(patsubst %.html,%,$(notdir $@))/index.html"> \
Goto $(patsubst %.html,%,$(notdir $@))</a><p>' > $@
@@ -127,7 +145,7 @@ quiet_cmd_db2html = XMLTO $@
@if [ ! -z "$(PNG-$(basename $(notdir $@)))" ]; then \
cp $(PNG-$(basename $(notdir $@))) $(patsubst %.html,%,$@); fi
-quiet_cmd_db2man = XMLTO $@
+quiet_cmd_db2man = MAN $@
cmd_db2man = if grep -q refentry $<; then xmlto man $(XMLTOFLAGS) -o $(obj)/man $< ; gzip -f $(obj)/man/*.9; fi
%.9 : %.xml
@(which xmlto > /dev/null 2>&1) || \
diff --git a/Documentation/DocBook/journal-api.tmpl b/Documentation/DocBook/journal-api.tmpl
index 341aaa4..2077f9a 100644
--- a/Documentation/DocBook/journal-api.tmpl
+++ b/Documentation/DocBook/journal-api.tmpl
@@ -306,7 +306,7 @@ an example.
</para>
<sect1><title>Journal Level</title>
!Efs/jbd/journal.c
-!Efs/jbd/recovery.c
+!Ifs/jbd/recovery.c
</sect1>
<sect1><title>Transasction Level</title>
!Efs/jbd/transaction.c
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index d650ce3..767433b 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -68,9 +68,7 @@ X!Iinclude/linux/kobject.h
<sect1><title>Kernel utility functions</title>
!Iinclude/linux/kernel.h
-<!-- This needs to clean up to make kernel-doc happy
-X!Ekernel/printk.c
- -->
+!Ekernel/printk.c
!Ekernel/panic.c
!Ekernel/sys.c
!Ekernel/rcupdate.c
@@ -118,7 +116,7 @@ X!Ilib/string.c
</sect1>
<sect1><title>User Space Memory Access</title>
!Iinclude/asm-i386/uaccess.h
-!Iarch/i386/lib/usercopy.c
+!Earch/i386/lib/usercopy.c
</sect1>
<sect1><title>More Memory Management Functions</title>
!Iinclude/linux/rmap.h
@@ -174,7 +172,6 @@ X!Ilib/string.c
<title>The Linux VFS</title>
<sect1><title>The Filesystem types</title>
!Iinclude/linux/fs.h
-!Einclude/linux/fs.h
</sect1>
<sect1><title>The Directory Cache</title>
!Efs/dcache.c
@@ -239,9 +236,11 @@ X!Ilib/string.c
<title>Network device support</title>
<sect1><title>Driver Support</title>
!Enet/core/dev.c
- </sect1>
- <sect1><title>8390 Based Network Cards</title>
-!Edrivers/net/8390.c
+!Enet/ethernet/eth.c
+!Iinclude/linux/etherdevice.h
+<!-- FIXME: Removed for now since no structured comments in source
+X!Enet/core/wireless.c
+-->
</sect1>
<sect1><title>Synchronous PPP</title>
!Edrivers/net/wan/syncppp.c
@@ -266,7 +265,7 @@ X!Ekernel/module.c
<chapter id="hardware">
<title>Hardware Interfaces</title>
<sect1><title>Interrupt Handling</title>
-!Ikernel/irq/manage.c
+!Ekernel/irq/manage.c
</sect1>
<sect1><title>Resources Management</title>
@@ -286,7 +285,9 @@ X!Edrivers/pci/search.c
-->
!Edrivers/pci/msi.c
!Edrivers/pci/bus.c
-!Edrivers/pci/hotplug.c
+<!-- FIXME: Removed for now since no structured comments in source
+X!Edrivers/pci/hotplug.c
+-->
!Edrivers/pci/probe.c
!Edrivers/pci/rom.c
</sect1>
@@ -387,7 +388,7 @@ X!Edrivers/pnp/system.c
<chapter id="blkdev">
<title>Block Devices</title>
-!Edrivers/block/ll_rw_blk.c
+!Eblock/ll_rw_blk.c
</chapter>
<chapter id="miscdev">
@@ -499,7 +500,7 @@ KAO -->
!Edrivers/video/modedb.c
</sect1>
<sect1><title>Frame Buffer Macintosh Video Mode Database</title>
-!Idrivers/video/macmodes.c
+!Edrivers/video/macmodes.c
</sect1>
<sect1><title>Frame Buffer Fonts</title>
<para>
diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl
index 375ae76..d260d92 100644
--- a/Documentation/DocBook/libata.tmpl
+++ b/Documentation/DocBook/libata.tmpl
@@ -415,6 +415,362 @@ and other resources, etc.
</sect1>
</chapter>
+ <chapter id="libataEH">
+ <title>Error handling</title>
+
+ <para>
+ This chapter describes how errors are handled under libata.
+ Readers are advised to read SCSI EH
+ (Documentation/scsi/scsi_eh.txt) and ATA exceptions doc first.
+ </para>
+
+ <sect1><title>Origins of commands</title>
+ <para>
+ In libata, a command is represented with struct ata_queued_cmd
+ or qc. qc's are preallocated during port initialization and
+ repetitively used for command executions. Currently only one
+ qc is allocated per port but yet-to-be-merged NCQ branch
+ allocates one for each tag and maps each qc to NCQ tag 1-to-1.
+ </para>
+ <para>
+ libata commands can originate from two sources - libata itself
+ and SCSI midlayer. libata internal commands are used for
+ initialization and error handling. All normal blk requests
+ and commands for SCSI emulation are passed as SCSI commands
+ through queuecommand callback of SCSI host template.
+ </para>
+ </sect1>
+
+ <sect1><title>How commands are issued</title>
+
+ <variablelist>
+
+ <varlistentry><term>Internal commands</term>
+ <listitem>
+ <para>
+ First, qc is allocated and initialized using
+ ata_qc_new_init(). Although ata_qc_new_init() doesn't
+ implement any wait or retry mechanism when qc is not
+ available, internal commands are currently issued only during
+ initialization and error recovery, so no other command is
+ active and allocation is guaranteed to succeed.
+ </para>
+ <para>
+ Once allocated qc's taskfile is initialized for the command to
+ be executed. qc currently has two mechanisms to notify
+ completion. One is via qc->complete_fn() callback and the
+ other is completion qc->waiting. qc->complete_fn() callback
+ is the asynchronous path used by normal SCSI translated
+ commands and qc->waiting is the synchronous (issuer sleeps in
+ process context) path used by internal commands.
+ </para>
+ <para>
+ Once initialization is complete, host_set lock is acquired
+ and the qc is issued.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>SCSI commands</term>
+ <listitem>
+ <para>
+ All libata drivers use ata_scsi_queuecmd() as
+ hostt->queuecommand callback. scmds can either be simulated
+ or translated. No qc is involved in processing a simulated
+ scmd. The result is computed right away and the scmd is
+ completed.
+ </para>
+ <para>
+ For a translated scmd, ata_qc_new_init() is invoked to
+ allocate a qc and the scmd is translated into the qc. SCSI
+ midlayer's completion notification function pointer is stored
+ into qc->scsidone.
+ </para>
+ <para>
+ qc->complete_fn() callback is used for completion
+ notification. ATA commands use ata_scsi_qc_complete() while
+ ATAPI commands use atapi_qc_complete(). Both functions end up
+ calling qc->scsidone to notify upper layer when the qc is
+ finished. After translation is completed, the qc is issued
+ with ata_qc_issue().
+ </para>
+ <para>
+ Note that SCSI midlayer invokes hostt->queuecommand while
+ holding host_set lock, so all above occur while holding
+ host_set lock.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ </variablelist>
+ </sect1>
+
+ <sect1><title>How commands are processed</title>
+ <para>
+ Depending on which protocol and which controller are used,
+ commands are processed differently. For the purpose of
+ discussion, a controller which uses taskfile interface and all
+ standard callbacks is assumed.
+ </para>
+ <para>
+ Currently 6 ATA command protocols are used. They can be
+ sorted into the following four categories according to how
+ they are processed.
+ </para>
+
+ <variablelist>
+ <varlistentry><term>ATA NO DATA or DMA</term>
+ <listitem>
+ <para>
+ ATA_PROT_NODATA and ATA_PROT_DMA fall into this category.
+ These types of commands don't require any software
+ intervention once issued. Device will raise interrupt on
+ completion.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>ATA PIO</term>
+ <listitem>
+ <para>
+ ATA_PROT_PIO is in this category. libata currently
+ implements PIO with polling. ATA_NIEN bit is set to turn
+ off interrupt and pio_task on ata_wq performs polling and
+ IO.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>ATAPI NODATA or DMA</term>
+ <listitem>
+ <para>
+ ATA_PROT_ATAPI_NODATA and ATA_PROT_ATAPI_DMA are in this
+ category. packet_task is used to poll BSY bit after
+ issuing PACKET command. Once BSY is turned off by the
+ device, packet_task transfers CDB and hands off processing
+ to interrupt handler.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>ATAPI PIO</term>
+ <listitem>
+ <para>
+ ATA_PROT_ATAPI is in this category. ATA_NIEN bit is set
+ and, as in ATAPI NODATA or DMA, packet_task submits cdb.
+ However, after submitting cdb, further processing (data
+ transfer) is handed off to pio_task.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+ </sect1>
+
+ <sect1><title>How commands are completed</title>
+ <para>
+ Once issued, all qc's are either completed with
+ ata_qc_complete() or time out. For commands which are handled
+ by interrupts, ata_host_intr() invokes ata_qc_complete(), and,
+ for PIO tasks, pio_task invokes ata_qc_complete(). In error
+ cases, packet_task may also complete commands.
+ </para>
+ <para>
+ ata_qc_complete() does the following.
+ </para>
+
+ <orderedlist>
+
+ <listitem>
+ <para>
+ DMA memory is unmapped.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ ATA_QCFLAG_ACTIVE is clared from qc->flags.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ qc->complete_fn() callback is invoked. If the return value of
+ the callback is not zero. Completion is short circuited and
+ ata_qc_complete() returns.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ __ata_qc_complete() is called, which does
+ <orderedlist>
+
+ <listitem>
+ <para>
+ qc->flags is cleared to zero.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ ap->active_tag and qc->tag are poisoned.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ qc->waiting is claread & completed (in that order).
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ qc is deallocated by clearing appropriate bit in ap->qactive.
+ </para>
+ </listitem>
+
+ </orderedlist>
+ </para>
+ </listitem>
+
+ </orderedlist>
+
+ <para>
+ So, it basically notifies upper layer and deallocates qc. One
+ exception is short-circuit path in #3 which is used by
+ atapi_qc_complete().
+ </para>
+ <para>
+ For all non-ATAPI commands, whether it fails or not, almost
+ the same code path is taken and very little error handling
+ takes place. A qc is completed with success status if it
+ succeeded, with failed status otherwise.
+ </para>
+ <para>
+ However, failed ATAPI commands require more handling as
+ REQUEST SENSE is needed to acquire sense data. If an ATAPI
+ command fails, ata_qc_complete() is invoked with error status,
+ which in turn invokes atapi_qc_complete() via
+ qc->complete_fn() callback.
+ </para>
+ <para>
+ This makes atapi_qc_complete() set scmd->result to
+ SAM_STAT_CHECK_CONDITION, complete the scmd and return 1. As
+ the sense data is empty but scmd->result is CHECK CONDITION,
+ SCSI midlayer will invoke EH for the scmd, and returning 1
+ makes ata_qc_complete() to return without deallocating the qc.
+ This leads us to ata_scsi_error() with partially completed qc.
+ </para>
+
+ </sect1>
+
+ <sect1><title>ata_scsi_error()</title>
+ <para>
+ ata_scsi_error() is the current hostt->eh_strategy_handler()
+ for libata. As discussed above, this will be entered in two
+ cases - timeout and ATAPI error completion. This function
+ calls low level libata driver's eng_timeout() callback, the
+ standard callback for which is ata_eng_timeout(). It checks
+ if a qc is active and calls ata_qc_timeout() on the qc if so.
+ Actual error handling occurs in ata_qc_timeout().
+ </para>
+ <para>
+ If EH is invoked for timeout, ata_qc_timeout() stops BMDMA and
+ completes the qc. Note that as we're currently in EH, we
+ cannot call scsi_done. As described in SCSI EH doc, a
+ recovered scmd should be either retried with
+ scsi_queue_insert() or finished with scsi_finish_command().
+ Here, we override qc->scsidone with scsi_finish_command() and
+ calls ata_qc_complete().
+ </para>
+ <para>
+ If EH is invoked due to a failed ATAPI qc, the qc here is
+ completed but not deallocated. The purpose of this
+ half-completion is to use the qc as place holder to make EH
+ code reach this place. This is a bit hackish, but it works.
+ </para>
+ <para>
+ Once control reaches here, the qc is deallocated by invoking
+ __ata_qc_complete() explicitly. Then, internal qc for REQUEST
+ SENSE is issued. Once sense data is acquired, scmd is
+ finished by directly invoking scsi_finish_command() on the
+ scmd. Note that as we already have completed and deallocated
+ the qc which was associated with the scmd, we don't need
+ to/cannot call ata_qc_complete() again.
+ </para>
+
+ </sect1>
+
+ <sect1><title>Problems with the current EH</title>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ Error representation is too crude. Currently any and all
+ error conditions are represented with ATA STATUS and ERROR
+ registers. Errors which aren't ATA device errors are treated
+ as ATA device errors by setting ATA_ERR bit. Better error
+ descriptor which can properly represent ATA and other
+ errors/exceptions is needed.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ When handling timeouts, no action is taken to make device
+ forget about the timed out command and ready for new commands.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ EH handling via ata_scsi_error() is not properly protected
+ from usual command processing. On EH entrance, the device is
+ not in quiescent state. Timed out commands may succeed or
+ fail any time. pio_task and atapi_task may still be running.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Too weak error recovery. Devices / controllers causing HSM
+ mismatch errors and other errors quite often require reset to
+ return to known state. Also, advanced error handling is
+ necessary to support features like NCQ and hotplug.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ ATA errors are directly handled in the interrupt handler and
+ PIO errors in pio_task. This is problematic for advanced
+ error handling for the following reasons.
+ </para>
+ <para>
+ First, advanced error handling often requires context and
+ internal qc execution.
+ </para>
+ <para>
+ Second, even a simple failure (say, CRC error) needs
+ information gathering and could trigger complex error handling
+ (say, resetting & reconfiguring). Having multiple code
+ paths to gather information, enter EH and trigger actions
+ makes life painful.
+ </para>
+ <para>
+ Third, scattered EH code makes implementing low level drivers
+ difficult. Low level drivers override libata callbacks. If
+ EH is scattered over several places, each affected callbacks
+ should perform its part of error handling. This can be error
+ prone and painful.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+ </sect1>
+ </chapter>
+
<chapter id="libataExt">
<title>libata Library</title>
!Edrivers/scsi/libata-core.c
@@ -431,6 +787,722 @@ and other resources, etc.
!Idrivers/scsi/libata-scsi.c
</chapter>
+ <chapter id="ataExceptions">
+ <title>ATA errors & exceptions</title>
+
+ <para>
+ This chapter tries to identify what error/exception conditions exist
+ for ATA/ATAPI devices and describe how they should be handled in
+ implementation-neutral way.
+ </para>
+
+ <para>
+ The term 'error' is used to describe conditions where either an
+ explicit error condition is reported from device or a command has
+ timed out.
+ </para>
+
+ <para>
+ The term 'exception' is either used to describe exceptional
+ conditions which are not errors (say, power or hotplug events), or
+ to describe both errors and non-error exceptional conditions. Where
+ explicit distinction between error and exception is necessary, the
+ term 'non-error exception' is used.
+ </para>
+
+ <sect1 id="excat">
+ <title>Exception categories</title>
+ <para>
+ Exceptions are described primarily with respect to legacy
+ taskfile + bus master IDE interface. If a controller provides
+ other better mechanism for error reporting, mapping those into
+ categories described below shouldn't be difficult.
+ </para>
+
+ <para>
+ In the following sections, two recovery actions - reset and
+ reconfiguring transport - are mentioned. These are described
+ further in <xref linkend="exrec"/>.
+ </para>
+
+ <sect2 id="excatHSMviolation">
+ <title>HSM violation</title>
+ <para>
+ This error is indicated when STATUS value doesn't match HSM
+ requirement during issuing or excution any ATA/ATAPI command.
+ </para>
+
+ <itemizedlist>
+ <title>Examples</title>
+
+ <listitem>
+ <para>
+ ATA_STATUS doesn't contain !BSY && DRDY && !DRQ while trying
+ to issue a command.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ !BSY && !DRQ during PIO data transfer.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ DRQ on command completion.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ !BSY && ERR after CDB tranfer starts but before the
+ last byte of CDB is transferred. ATA/ATAPI standard states
+ that "The device shall not terminate the PACKET command
+ with an error before the last byte of the command packet has
+ been written" in the error outputs description of PACKET
+ command and the state diagram doesn't include such
+ transitions.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ In these cases, HSM is violated and not much information
+ regarding the error can be acquired from STATUS or ERROR
+ register. IOW, this error can be anything - driver bug,
+ faulty device, controller and/or cable.
+ </para>
+
+ <para>
+ As HSM is violated, reset is necessary to restore known state.
+ Reconfiguring transport for lower speed might be helpful too
+ as transmission errors sometimes cause this kind of errors.
+ </para>
+ </sect2>
+
+ <sect2 id="excatDevErr">
+ <title>ATA/ATAPI device error (non-NCQ / non-CHECK CONDITION)</title>
+
+ <para>
+ These are errors detected and reported by ATA/ATAPI devices
+ indicating device problems. For this type of errors, STATUS
+ and ERROR register values are valid and describe error
+ condition. Note that some of ATA bus errors are detected by
+ ATA/ATAPI devices and reported using the same mechanism as
+ device errors. Those cases are described later in this
+ section.
+ </para>
+
+ <para>
+ For ATA commands, this type of errors are indicated by !BSY
+ && ERR during command execution and on completion.
+ </para>
+
+ <para>For ATAPI commands,</para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ !BSY && ERR && ABRT right after issuing PACKET
+ indicates that PACKET command is not supported and falls in
+ this category.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ !BSY && ERR(==CHK) && !ABRT after the last
+ byte of CDB is transferred indicates CHECK CONDITION and
+ doesn't fall in this category.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ !BSY && ERR(==CHK) && ABRT after the last byte
+ of CDB is transferred *probably* indicates CHECK CONDITION and
+ doesn't fall in this category.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ Of errors detected as above, the followings are not ATA/ATAPI
+ device errors but ATA bus errors and should be handled
+ according to <xref linkend="excatATAbusErr"/>.
+ </para>
+
+ <variablelist>
+
+ <varlistentry>
+ <term>CRC error during data transfer</term>
+ <listitem>
+ <para>
+ This is indicated by ICRC bit in the ERROR register and
+ means that corruption occurred during data transfer. Upto
+ ATA/ATAPI-7, the standard specifies that this bit is only
+ applicable to UDMA transfers but ATA/ATAPI-8 draft revision
+ 1f says that the bit may be applicable to multiword DMA and
+ PIO.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>ABRT error during data transfer or on completion</term>
+ <listitem>
+ <para>
+ Upto ATA/ATAPI-7, the standard specifies that ABRT could be
+ set on ICRC errors and on cases where a device is not able
+ to complete a command. Combined with the fact that MWDMA
+ and PIO transfer errors aren't allowed to use ICRC bit upto
+ ATA/ATAPI-7, it seems to imply that ABRT bit alone could
+ indicate tranfer errors.
+ </para>
+ <para>
+ However, ATA/ATAPI-8 draft revision 1f removes the part
+ that ICRC errors can turn on ABRT. So, this is kind of
+ gray area. Some heuristics are needed here.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ </variablelist>
+
+ <para>
+ ATA/ATAPI device errors can be further categorized as follows.
+ </para>
+
+ <variablelist>
+
+ <varlistentry>
+ <term>Media errors</term>
+ <listitem>
+ <para>
+ This is indicated by UNC bit in the ERROR register. ATA
+ devices reports UNC error only after certain number of
+ retries cannot recover the data, so there's nothing much
+ else to do other than notifying upper layer.
+ </para>
+ <para>
+ READ and WRITE commands report CHS or LBA of the first
+ failed sector but ATA/ATAPI standard specifies that the
+ amount of transferred data on error completion is
+ indeterminate, so we cannot assume that sectors preceding
+ the failed sector have been transferred and thus cannot
+ complete those sectors successfully as SCSI does.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>Media changed / media change requested error</term>
+ <listitem>
+ <para>
+ <<TODO: fill here>>
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>Address error</term>
+ <listitem>
+ <para>
+ This is indicated by IDNF bit in the ERROR register.
+ Report to upper layer.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>Other errors</term>
+ <listitem>
+ <para>
+ This can be invalid command or parameter indicated by ABRT
+ ERROR bit or some other error condition. Note that ABRT
+ bit can indicate a lot of things including ICRC and Address
+ errors. Heuristics needed.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ </variablelist>
+
+ <para>
+ Depending on commands, not all STATUS/ERROR bits are
+ applicable. These non-applicable bits are marked with
+ "na" in the output descriptions but upto ATA/ATAPI-7
+ no definition of "na" can be found. However,
+ ATA/ATAPI-8 draft revision 1f describes "N/A" as
+ follows.
+ </para>
+
+ <blockquote>
+ <variablelist>
+ <varlistentry><term>3.2.3.3a N/A</term>
+ <listitem>
+ <para>
+ A keyword the indicates a field has no defined value in
+ this standard and should not be checked by the host or
+ device. N/A fields should be cleared to zero.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+ </blockquote>
+
+ <para>
+ So, it seems reasonable to assume that "na" bits are
+ cleared to zero by devices and thus need no explicit masking.
+ </para>
+
+ </sect2>
+
+ <sect2 id="excatATAPIcc">
+ <title>ATAPI device CHECK CONDITION</title>
+
+ <para>
+ ATAPI device CHECK CONDITION error is indicated by set CHK bit
+ (ERR bit) in the STATUS register after the last byte of CDB is
+ transferred for a PACKET command. For this kind of errors,
+ sense data should be acquired to gather information regarding
+ the errors. REQUEST SENSE packet command should be used to
+ acquire sense data.
+ </para>
+
+ <para>
+ Once sense data is acquired, this type of errors can be
+ handled similary to other SCSI errors. Note that sense data
+ may indicate ATA bus error (e.g. Sense Key 04h HARDWARE ERROR
+ && ASC/ASCQ 47h/00h SCSI PARITY ERROR). In such
+ cases, the error should be considered as an ATA bus error and
+ handled according to <xref linkend="excatATAbusErr"/>.
+ </para>
+
+ </sect2>
+
+ <sect2 id="excatNCQerr">
+ <title>ATA device error (NCQ)</title>
+
+ <para>
+ NCQ command error is indicated by cleared BSY and set ERR bit
+ during NCQ command phase (one or more NCQ commands
+ outstanding). Although STATUS and ERROR registers will
+ contain valid values describing the error, READ LOG EXT is
+ required to clear the error condition, determine which command
+ has failed and acquire more information.
+ </para>
+
+ <para>
+ READ LOG EXT Log Page 10h reports which tag has failed and
+ taskfile register values describing the error. With this
+ information the failed command can be handled as a normal ATA
+ command error as in <xref linkend="excatDevErr"/> and all
+ other in-flight commands must be retried. Note that this
+ retry should not be counted - it's likely that commands
+ retried this way would have completed normally if it were not
+ for the failed command.
+ </para>
+
+ <para>
+ Note that ATA bus errors can be reported as ATA device NCQ
+ errors. This should be handled as described in <xref
+ linkend="excatATAbusErr"/>.
+ </para>
+
+ <para>
+ If READ LOG EXT Log Page 10h fails or reports NQ, we're
+ thoroughly screwed. This condition should be treated
+ according to <xref linkend="excatHSMviolation"/>.
+ </para>
+
+ </sect2>
+
+ <sect2 id="excatATAbusErr">
+ <title>ATA bus error</title>
+
+ <para>
+ ATA bus error means that data corruption occurred during
+ transmission over ATA bus (SATA or PATA). This type of errors
+ can be indicated by
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ ICRC or ABRT error as described in <xref linkend="excatDevErr"/>.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Controller-specific error completion with error information
+ indicating transmission error.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ On some controllers, command timeout. In this case, there may
+ be a mechanism to determine that the timeout is due to
+ transmission error.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Unknown/random errors, timeouts and all sorts of weirdities.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ As described above, transmission errors can cause wide variety
+ of symptoms ranging from device ICRC error to random device
+ lockup, and, for many cases, there is no way to tell if an
+ error condition is due to transmission error or not;
+ therefore, it's necessary to employ some kind of heuristic
+ when dealing with errors and timeouts. For example,
+ encountering repetitive ABRT errors for known supported
+ command is likely to indicate ATA bus error.
+ </para>
+
+ <para>
+ Once it's determined that ATA bus errors have possibly
+ occurred, lowering ATA bus transmission speed is one of
+ actions which may alleviate the problem. See <xref
+ linkend="exrecReconf"/> for more information.
+ </para>
+
+ </sect2>
+
+ <sect2 id="excatPCIbusErr">
+ <title>PCI bus error</title>
+
+ <para>
+ Data corruption or other failures during transmission over PCI
+ (or other system bus). For standard BMDMA, this is indicated
+ by Error bit in the BMDMA Status register. This type of
+ errors must be logged as it indicates something is very wrong
+ with the system. Resetting host controller is recommended.
+ </para>
+
+ </sect2>
+
+ <sect2 id="excatLateCompletion">
+ <title>Late completion</title>
+
+ <para>
+ This occurs when timeout occurs and the timeout handler finds
+ out that the timed out command has completed successfully or
+ with error. This is usually caused by lost interrupts. This
+ type of errors must be logged. Resetting host controller is
+ recommended.
+ </para>
+
+ </sect2>
+
+ <sect2 id="excatUnknown">
+ <title>Unknown error (timeout)</title>
+
+ <para>
+ This is when timeout occurs and the command is still
+ processing or the host and device are in unknown state. When
+ this occurs, HSM could be in any valid or invalid state. To
+ bring the device to known state and make it forget about the
+ timed out command, resetting is necessary. The timed out
+ command may be retried.
+ </para>
+
+ <para>
+ Timeouts can also be caused by transmission errors. Refer to
+ <xref linkend="excatATAbusErr"/> for more details.
+ </para>
+
+ </sect2>
+
+ <sect2 id="excatHoplugPM">
+ <title>Hotplug and power management exceptions</title>
+
+ <para>
+ <<TODO: fill here>>
+ </para>
+
+ </sect2>
+
+ </sect1>
+
+ <sect1 id="exrec">
+ <title>EH recovery actions</title>
+
+ <para>
+ This section discusses several important recovery actions.
+ </para>
+
+ <sect2 id="exrecClr">
+ <title>Clearing error condition</title>
+
+ <para>
+ Many controllers require its error registers to be cleared by
+ error handler. Different controllers may have different
+ requirements.
+ </para>
+
+ <para>
+ For SATA, it's strongly recommended to clear at least SError
+ register during error handling.
+ </para>
+ </sect2>
+
+ <sect2 id="exrecRst">
+ <title>Reset</title>
+
+ <para>
+ During EH, resetting is necessary in the following cases.
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ HSM is in unknown or invalid state
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ HBA is in unknown or invalid state
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ EH needs to make HBA/device forget about in-flight commands
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ HBA/device behaves weirdly
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ Resetting during EH might be a good idea regardless of error
+ condition to improve EH robustness. Whether to reset both or
+ either one of HBA and device depends on situation but the
+ following scheme is recommended.
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ When it's known that HBA is in ready state but ATA/ATAPI
+ device in in unknown state, reset only device.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ If HBA is in unknown state, reset both HBA and device.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ HBA resetting is implementation specific. For a controller
+ complying to taskfile/BMDMA PCI IDE, stopping active DMA
+ transaction may be sufficient iff BMDMA state is the only HBA
+ context. But even mostly taskfile/BMDMA PCI IDE complying
+ controllers may have implementation specific requirements and
+ mechanism to reset themselves. This must be addressed by
+ specific drivers.
+ </para>
+
+ <para>
+ OTOH, ATA/ATAPI standard describes in detail ways to reset
+ ATA/ATAPI devices.
+ </para>
+
+ <variablelist>
+
+ <varlistentry><term>PATA hardware reset</term>
+ <listitem>
+ <para>
+ This is hardware initiated device reset signalled with
+ asserted PATA RESET- signal. There is no standard way to
+ initiate hardware reset from software although some
+ hardware provides registers that allow driver to directly
+ tweak the RESET- signal.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>Software reset</term>
+ <listitem>
+ <para>
+ This is achieved by turning CONTROL SRST bit on for at
+ least 5us. Both PATA and SATA support it but, in case of
+ SATA, this may require controller-specific support as the
+ second Register FIS to clear SRST should be transmitted
+ while BSY bit is still set. Note that on PATA, this resets
+ both master and slave devices on a channel.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>EXECUTE DEVICE DIAGNOSTIC command</term>
+ <listitem>
+ <para>
+ Although ATA/ATAPI standard doesn't describe exactly, EDD
+ implies some level of resetting, possibly similar level
+ with software reset. Host-side EDD protocol can be handled
+ with normal command processing and most SATA controllers
+ should be able to handle EDD's just like other commands.
+ As in software reset, EDD affects both devices on a PATA
+ bus.
+ </para>
+ <para>
+ Although EDD does reset devices, this doesn't suit error
+ handling as EDD cannot be issued while BSY is set and it's
+ unclear how it will act when device is in unknown/weird
+ state.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>ATAPI DEVICE RESET command</term>
+ <listitem>
+ <para>
+ This is very similar to software reset except that reset
+ can be restricted to the selected device without affecting
+ the other device sharing the cable.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry><term>SATA phy reset</term>
+ <listitem>
+ <para>
+ This is the preferred way of resetting a SATA device. In
+ effect, it's identical to PATA hardware reset. Note that
+ this can be done with the standard SCR Control register.
+ As such, it's usually easier to implement than software
+ reset.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ </variablelist>
+
+ <para>
+ One more thing to consider when resetting devices is that
+ resetting clears certain configuration parameters and they
+ need to be set to their previous or newly adjusted values
+ after reset.
+ </para>
+
+ <para>
+ Parameters affected are.
+ </para>
+
+ <itemizedlist>
+
+ <listitem>
+ <para>
+ CHS set up with INITIALIZE DEVICE PARAMETERS (seldomly used)
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Parameters set with SET FEATURES including transfer mode setting
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Block count set with SET MULTIPLE MODE
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Other parameters (SET MAX, MEDIA LOCK...)
+ </para>
+ </listitem>
+
+ </itemizedlist>
+
+ <para>
+ ATA/ATAPI standard specifies that some parameters must be
+ maintained across hardware or software reset, but doesn't
+ strictly specify all of them. Always reconfiguring needed
+ parameters after reset is required for robustness. Note that
+ this also applies when resuming from deep sleep (power-off).
+ </para>
+
+ <para>
+ Also, ATA/ATAPI standard requires that IDENTIFY DEVICE /
+ IDENTIFY PACKET DEVICE is issued after any configuration
+ parameter is updated or a hardware reset and the result used
+ for further operation. OS driver is required to implement
+ revalidation mechanism to support this.
+ </para>
+
+ </sect2>
+
+ <sect2 id="exrecReconf">
+ <title>Reconfigure transport</title>
+
+ <para>
+ For both PATA and SATA, a lot of corners are cut for cheap
+ connectors, cables or controllers and it's quite common to see
+ high transmission error rate. This can be mitigated by
+ lowering transmission speed.
+ </para>
+
+ <para>
+ The following is a possible scheme Jeff Garzik suggested.
+ </para>
+
+ <blockquote>
+ <para>
+ If more than $N (3?) transmission errors happen in 15 minutes,
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ if SATA, decrease SATA PHY speed. if speed cannot be decreased,
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ decrease UDMA xfer speed. if at UDMA0, switch to PIO4,
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ decrease PIO xfer speed. if at PIO3, complain, but continue
+ </para>
+ </listitem>
+ </itemizedlist>
+ </blockquote>
+
+ </sect2>
+
+ </sect1>
+
+ </chapter>
+
<chapter id="PiixInt">
<title>ata_piix Internals</title>
!Idrivers/scsi/ata_piix.c
diff --git a/Documentation/DocBook/rapidio.tmpl b/Documentation/DocBook/rapidio.tmpl
new file mode 100644
index 0000000..1becf27
--- /dev/null
+++ b/Documentation/DocBook/rapidio.tmpl
@@ -0,0 +1,160 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" [
+ <!ENTITY rapidio SYSTEM "rapidio.xml">
+ ]>
+
+<book id="RapidIO-Guide">
+ <bookinfo>
+ <title>RapidIO Subsystem Guide</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Matt</firstname>
+ <surname>Porter</surname>
+ <affiliation>
+ <address>
+ <email>mporter@kernel.crashing.org</email>
+ <email>mporter@mvista.com</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2005</year>
+ <holder>MontaVista Software, Inc.</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License version 2 as published by the Free Software Foundation.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ RapidIO is a high speed switched fabric interconnect with
+ features aimed at the embedded market. RapidIO provides
+ support for memory-mapped I/O as well as message-based
+ transactions over the switched fabric network. RapidIO has
+ a standardized discovery mechanism not unlike the PCI bus
+ standard that allows simple detection of devices in a
+ network.
+ </para>
+ <para>
+ This documentation is provided for developers intending
+ to support RapidIO on new architectures, write new drivers,
+ or to understand the subsystem internals.
+ </para>
+ </chapter>
+
+ <chapter id="bugs">
+ <title>Known Bugs and Limitations</title>
+
+ <sect1>
+ <title>Bugs</title>
+ <para>None. ;)</para>
+ </sect1>
+ <sect1>
+ <title>Limitations</title>
+ <para>
+ <orderedlist>
+ <listitem><para>Access/management of RapidIO memory regions is not supported</para></listitem>
+ <listitem><para>Multiple host enumeration is not supported</para></listitem>
+ </orderedlist>
+ </para>
+ </sect1>
+ </chapter>
+
+ <chapter id="drivers">
+ <title>RapidIO driver interface</title>
+ <para>
+ Drivers are provided a set of calls in order
+ to interface with the subsystem to gather info
+ on devices, request/map memory region resources,
+ and manage mailboxes/doorbells.
+ </para>
+ <sect1>
+ <title>Functions</title>
+!Iinclude/linux/rio_drv.h
+!Edrivers/rapidio/rio-driver.c
+!Edrivers/rapidio/rio.c
+ </sect1>
+ </chapter>
+
+ <chapter id="internals">
+ <title>Internals</title>
+
+ <para>
+ This chapter contains the autogenerated documentation of the RapidIO
+ subsystem.
+ </para>
+
+ <sect1><title>Structures</title>
+!Iinclude/linux/rio.h
+ </sect1>
+ <sect1><title>Enumeration and Discovery</title>
+!Idrivers/rapidio/rio-scan.c
+ </sect1>
+ <sect1><title>Driver functionality</title>
+!Idrivers/rapidio/rio.c
+!Idrivers/rapidio/rio-access.c
+ </sect1>
+ <sect1><title>Device model support</title>
+!Idrivers/rapidio/rio-driver.c
+ </sect1>
+ <sect1><title>Sysfs support</title>
+!Idrivers/rapidio/rio-sysfs.c
+ </sect1>
+ <sect1><title>PPC32 support</title>
+!Iarch/ppc/kernel/rio.c
+!Earch/ppc/syslib/ppc85xx_rio.c
+!Iarch/ppc/syslib/ppc85xx_rio.c
+ </sect1>
+ </chapter>
+
+ <chapter id="credits">
+ <title>Credits</title>
+ <para>
+ The following people have contributed to the RapidIO
+ subsystem directly or indirectly:
+ <orderedlist>
+ <listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem>
+ <listitem><para>Randy Vinson<email>rvinson@mvista.com</email></para></listitem>
+ <listitem><para>Dan Malek<email>dan@embeddedalley.com</email></para></listitem>
+ </orderedlist>
+ </para>
+ <para>
+ The following people have contributed to this document:
+ <orderedlist>
+ <listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem>
+ </orderedlist>
+ </para>
+ </chapter>
+</book>
diff --git a/Documentation/DocBook/stylesheet.xsl b/Documentation/DocBook/stylesheet.xsl
index 64be9f7..3ccce88 100644
--- a/Documentation/DocBook/stylesheet.xsl
+++ b/Documentation/DocBook/stylesheet.xsl
@@ -3,4 +3,5 @@
<param name="chunk.quietly">1</param>
<param name="funcsynopsis.style">ansi</param>
<param name="funcsynopsis.tabular.threshold">80</param>
+<!-- <param name="paper.type">A4</param> -->
</stylesheet>
diff --git a/Documentation/DocBook/usb.tmpl b/Documentation/DocBook/usb.tmpl
index 705c442..15ce0f2 100644
--- a/Documentation/DocBook/usb.tmpl
+++ b/Documentation/DocBook/usb.tmpl
@@ -291,7 +291,7 @@
!Edrivers/usb/core/hcd.c
!Edrivers/usb/core/hcd-pci.c
-!Edrivers/usb/core/buffer.c
+!Idrivers/usb/core/buffer.c
</chapter>
<chapter>
diff --git a/Documentation/DocBook/writing_usb_driver.tmpl b/Documentation/DocBook/writing_usb_driver.tmpl
index 51f3bfb..008a341 100644
--- a/Documentation/DocBook/writing_usb_driver.tmpl
+++ b/Documentation/DocBook/writing_usb_driver.tmpl
@@ -345,8 +345,7 @@ if (!retval) {
<programlisting>
static inline void skel_delete (struct usb_skel *dev)
{
- if (dev->bulk_in_buffer != NULL)
- kfree (dev->bulk_in_buffer);
+ kfree (dev->bulk_in_buffer);
if (dev->bulk_out_buffer != NULL)
usb_buffer_free (dev->udev, dev->bulk_out_size,
dev->bulk_out_buffer,
diff --git a/Documentation/HOWTO b/Documentation/HOWTO
new file mode 100644
index 0000000..6c9e746
--- /dev/null
+++ b/Documentation/HOWTO
@@ -0,0 +1,618 @@
+HOWTO do Linux kernel development
+---------------------------------
+
+This is the be-all, end-all document on this topic. It contains
+instructions on how to become a Linux kernel developer and how to learn
+to work with the Linux kernel development community. It tries to not
+contain anything related to the technical aspects of kernel programming,
+but will help point you in the right direction for that.
+
+If anything in this document becomes out of date, please send in patches
+to the maintainer of this file, who is listed at the bottom of the
+document.
+
+
+Introduction
+------------
+
+So, you want to learn how to become a Linux kernel developer? Or you
+have been told by your manager, "Go write a Linux driver for this
+device." This document's goal is to teach you everything you need to
+know to achieve this by describing the process you need to go through,
+and hints on how to work with the community. It will also try to
+explain some of the reasons why the community works like it does.
+
+The kernel is written mostly in C, with some architecture-dependent
+parts written in assembly. A good understanding of C is required for
+kernel development. Assembly (any architecture) is not required unless
+you plan to do low-level development for that architecture. Though they
+are not a good substitute for a solid C education and/or years of
+experience, the following books are good for, if anything, reference:
+ - "The C Programming Language" by Kernighan and Ritchie [Prentice Hall]
+ - "Practical C Programming" by Steve Oualline [O'Reilly]
+
+The kernel is written using GNU C and the GNU toolchain. While it
+adheres to the ISO C89 standard, it uses a number of extensions that are
+not featured in the standard. The kernel is a freestanding C
+environment, with no reliance on the standard C library, so some
+portions of the C standard are not supported. Arbitrary long long
+divisions and floating point are not allowed. It can sometimes be
+difficult to understand the assumptions the kernel has on the toolchain
+and the extensions that it uses, and unfortunately there is no
+definitive reference for them. Please check the gcc info pages (`info
+gcc`) for some information on them.
+
+Please remember that you are trying to learn how to work with the
+existing development community. It is a diverse group of people, with
+high standards for coding, style and procedure. These standards have
+been created over time based on what they have found to work best for
+such a large and geographically dispersed team. Try to learn as much as
+possible about these standards ahead of time, as they are well
+documented; do not expect people to adapt to you or your company's way
+of doing things.
+
+
+Legal Issues
+------------
+
+The Linux kernel source code is released under the GPL. Please see the
+file, COPYING, in the main directory of the source tree, for details on
+the license. If you have further questions about the license, please
+contact a lawyer, and do not ask on the Linux kernel mailing list. The
+people on the mailing lists are not lawyers, and you should not rely on
+their statements on legal matters.
+
+For common questions and answers about the GPL, please see:
+ http://www.gnu.org/licenses/gpl-faq.html
+
+
+Documentation
+------------
+
+The Linux kernel source tree has a large range of documents that are
+invaluable for learning how to interact with the kernel community. When
+new features are added to the kernel, it is recommended that new
+documentation files are also added which explain how to use the feature.
+When a kernel change causes the interface that the kernel exposes to
+userspace to change, it is recommended that you send the information or
+a patch to the manual pages explaining the change to the manual pages
+maintainer at mtk-manpages@gmx.net.
+
+Here is a list of files that are in the kernel source tree that are
+required reading:
+ README
+ This file gives a short background on the Linux kernel and describes
+ what is necessary to do to configure and build the kernel. People
+ who are new to the kernel should start here.
+
+ Documentation/Changes
+ This file gives a list of the minimum levels of various software
+ packages that are necessary to build and run the kernel
+ successfully.
+
+ Documentation/CodingStyle
+ This describes the Linux kernel coding style, and some of the
+ rationale behind it. All new code is expected to follow the
+ guidelines in this document. Most maintainers will only accept
+ patches if these rules are followed, and many people will only
+ review code if it is in the proper style.
+
+ Documentation/SubmittingPatches
+ Documentation/SubmittingDrivers
+ These files describe in explicit detail how to successfully create
+ and send a patch, including (but not limited to):
+ - Email contents
+ - Email format
+ - Who to send it to
+ Following these rules will not guarantee success (as all patches are
+ subject to scrutiny for content and style), but not following them
+ will almost always prevent it.
+
+ Other excellent descriptions of how to create patches properly are:
+ "The Perfect Patch"
+ http://www.zip.com.au/~akpm/linux/patches/stuff/tpp.txt
+ "Linux kernel patch submission format"
+ http://linux.yyz.us/patch-format.html
+
+ Documentation/stable_api_nonsense.txt
+ This file describes the rationale behind the conscious decision to
+ not have a stable API within the kernel, including things like:
+ - Subsystem shim-layers (for compatibility?)
+ - Driver portability between Operating Systems.
+ - Mitigating rapid change within the kernel source tree (or
+ preventing rapid change)
+ This document is crucial for understanding the Linux development
+ philosophy and is very important for people moving to Linux from
+ development on other Operating Systems.
+
+ Documentation/SecurityBugs
+ If you feel you have found a security problem in the Linux kernel,
+ please follow the steps in this document to help notify the kernel
+ developers, and help solve the issue.
+
+ Documentation/ManagementStyle
+ This document describes how Linux kernel maintainers operate and the
+ shared ethos behind their methodologies. This is important reading
+ for anyone new to kernel development (or anyone simply curious about
+ it), as it resolves a lot of common misconceptions and confusion
+ about the unique behavior of kernel maintainers.
+
+ Documentation/stable_kernel_rules.txt
+ This file describes the rules on how the stable kernel releases
+ happen, and what to do if you want to get a change into one of these
+ releases.
+
+ Documentation/kernel-docs.txt
+ A list of external documentation that pertains to kernel
+ development. Please consult this list if you do not find what you
+ are looking for within the in-kernel documentation.
+
+ Documentation/applying-patches.txt
+ A good introduction describing exactly what a patch is and how to
+ apply it to the different development branches of the kernel.
+
+The kernel also has a large number of documents that can be
+automatically generated from the source code itself. This includes a
+full description of the in-kernel API, and rules on how to handle
+locking properly. The documents will be created in the
+Documentation/DocBook/ directory and can be generated as PDF,
+Postscript, HTML, and man pages by running:
+ make pdfdocs
+ make psdocs
+ make htmldocs
+ make mandocs
+respectively from the main kernel source directory.
+
+
+Becoming A Kernel Developer
+---------------------------
+
+If you do not know anything about Linux kernel development, you should
+look at the Linux KernelNewbies project:
+ http://kernelnewbies.org
+It consists of a helpful mailing list where you can ask almost any type
+of basic kernel development question (make sure to search the archives
+first, before asking something that has already been answered in the
+past.) It also has an IRC channel that you can use to ask questions in
+real-time, and a lot of helpful documentation that is useful for
+learning about Linux kernel development.
+
+The website has basic information about code organization, subsystems,
+and current projects (both in-tree and out-of-tree). It also describes
+some basic logistical information, like how to compile a kernel and
+apply a patch.
+
+If you do not know where you want to start, but you want to look for
+some task to start doing to join into the kernel development community,
+go to the Linux Kernel Janitor's project:
+ http://janitor.kernelnewbies.org/
+It is a great place to start. It describes a list of relatively simple
+problems that need to be cleaned up and fixed within the Linux kernel
+source tree. Working with the developers in charge of this project, you
+will learn the basics of getting your patch into the Linux kernel tree,
+and possibly be pointed in the direction of what to go work on next, if
+you do not already have an idea.
+
+If you already have a chunk of code that you want to put into the kernel
+tree, but need some help getting it in the proper form, the
+kernel-mentors project was created to help you out with this. It is a
+mailing list, and can be found at:
+ http://selenic.com/mailman/listinfo/kernel-mentors
+
+Before making any actual modifications to the Linux kernel code, it is
+imperative to understand how the code in question works. For this
+purpose, nothing is better than reading through it directly (most tricky
+bits are commented well), perhaps even with the help of specialized
+tools. One such tool that is particularly recommended is the Linux
+Cross-Reference project, which is able to present source code in a
+self-referential, indexed webpage format. An excellent up-to-date
+repository of the kernel code may be found at:
+ http://sosdg.org/~coywolf/lxr/
+
+
+The development process
+-----------------------
+
+Linux kernel development process currently consists of a few different
+main kernel "branches" and lots of different subsystem-specific kernel
+branches. These different branches are:
+ - main 2.6.x kernel tree
+ - 2.6.x.y -stable kernel tree
+ - 2.6.x -git kernel patches
+ - 2.6.x -mm kernel patches
+ - subsystem specific kernel trees and patches
+
+2.6.x kernel tree
+-----------------
+2.6.x kernels are maintained by Linus Torvalds, and can be found on
+kernel.org in the pub/linux/kernel/v2.6/ directory. Its development
+process is as follows:
+ - As soon as a new kernel is released a two weeks window is open,
+ during this period of time maintainers can submit big diffs to
+ Linus, usually the patches that have already been included in the
+ -mm kernel for a few weeks. The preferred way to submit big changes
+ is using git (the kernel's source management tool, more information
+ can be found at http://git.or.cz/) but plain patches are also just
+ fine.
+ - After two weeks a -rc1 kernel is released it is now possible to push
+ only patches that do not include new features that could affect the
+ stability of the whole kernel. Please note that a whole new driver
+ (or filesystem) might be accepted after -rc1 because there is no
+ risk of causing regressions with such a change as long as the change
+ is self-contained and does not affect areas outside of the code that
+ is being added. git can be used to send patches to Linus after -rc1
+ is released, but the patches need to also be sent to a public
+ mailing list for review.
+ - A new -rc is released whenever Linus deems the current git tree to
+ be in a reasonably sane state adequate for testing. The goal is to
+ release a new -rc kernel every week.
+ - Process continues until the kernel is considered "ready", the
+ process should last around 6 weeks.
+
+It is worth mentioning what Andrew Morton wrote on the linux-kernel
+mailing list about kernel releases:
+ "Nobody knows when a kernel will be released, because it's
+ released according to perceived bug status, not according to a
+ preconceived timeline."
+
+2.6.x.y -stable kernel tree
+---------------------------
+Kernels with 4 digit versions are -stable kernels. They contain
+relatively small and critical fixes for security problems or significant
+regressions discovered in a given 2.6.x kernel.
+
+This is the recommended branch for users who want the most recent stable
+kernel and are not interested in helping test development/experimental
+versions.
+
+If no 2.6.x.y kernel is available, then the highest numbered 2.6.x
+kernel is the current stable kernel.
+
+2.6.x.y are maintained by the "stable" team <stable@kernel.org>, and are
+released almost every other week.
+
+The file Documentation/stable_kernel_rules.txt in the kernel tree
+documents what kinds of changes are acceptable for the -stable tree, and
+how the release process works.
+
+2.6.x -git patches
+------------------
+These are daily snapshots of Linus' kernel tree which are managed in a
+git repository (hence the name.) These patches are usually released
+daily and represent the current state of Linus' tree. They are more
+experimental than -rc kernels since they are generated automatically
+without even a cursory glance to see if they are sane.
+
+2.6.x -mm kernel patches
+------------------------
+These are experimental kernel patches released by Andrew Morton. Andrew
+takes all of the different subsystem kernel trees and patches and mushes
+them together, along with a lot of patches that have been plucked from
+the linux-kernel mailing list. This tree serves as a proving ground for
+new features and patches. Once a patch has proved its worth in -mm for
+a while Andrew or the subsystem maintainer pushes it on to Linus for
+inclusion in mainline.
+
+It is heavily encouraged that all new patches get tested in the -mm tree
+before they are sent to Linus for inclusion in the main kernel tree.
+
+These kernels are not appropriate for use on systems that are supposed
+to be stable and they are more risky to run than any of the other
+branches.
+
+If you wish to help out with the kernel development process, please test
+and use these kernel releases and provide feedback to the linux-kernel
+mailing list if you have any problems, and if everything works properly.
+
+In addition to all the other experimental patches, these kernels usually
+also contain any changes in the mainline -git kernels available at the
+time of release.
+
+The -mm kernels are not released on a fixed schedule, but usually a few
+-mm kernels are released in between each -rc kernel (1 to 3 is common).
+
+Subsystem Specific kernel trees and patches
+-------------------------------------------
+A number of the different kernel subsystem developers expose their
+development trees so that others can see what is happening in the
+different areas of the kernel. These trees are pulled into the -mm
+kernel releases as described above.
+
+Here is a list of some of the different kernel trees available:
+ git trees:
+ - Kbuild development tree, Sam Ravnborg <sam@ravnborg.org>
+ kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git
+
+ - ACPI development tree, Len Brown <len.brown@intel.com>
+ kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git
+
+ - Block development tree, Jens Axboe <axboe@suse.de>
+ kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git
+
+ - DRM development tree, Dave Airlie <airlied@linux.ie>
+ kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git
+
+ - ia64 development tree, Tony Luck <tony.luck@intel.com>
+ kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git
+
+ - ieee1394 development tree, Jody McIntyre <scjody@modernduck.com>
+ kernel.org:/pub/scm/linux/kernel/git/scjody/ieee1394.git
+
+ - infiniband, Roland Dreier <rolandd@cisco.com>
+ kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git
+
+ - libata, Jeff Garzik <jgarzik@pobox.com>
+ kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git
+
+ - network drivers, Jeff Garzik <jgarzik@pobox.com>
+ kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git
+
+ - pcmcia, Dominik Brodowski <linux@dominikbrodowski.net>
+ kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git
+
+ - SCSI, James Bottomley <James.Bottomley@SteelEye.com>
+ kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git
+
+ Other git kernel trees can be found listed at http://kernel.org/git
+
+ quilt trees:
+ - USB, PCI, Driver Core, and I2C, Greg Kroah-Hartman <gregkh@suse.de>
+ kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
+
+
+Bug Reporting
+-------------
+
+bugzilla.kernel.org is where the Linux kernel developers track kernel
+bugs. Users are encouraged to report all bugs that they find in this
+tool. For details on how to use the kernel bugzilla, please see:
+ http://test.kernel.org/bugzilla/faq.html
+
+The file REPORTING-BUGS in the main kernel source directory has a good
+template for how to report a possible kernel bug, and details what kind
+of information is needed by the kernel developers to help track down the
+problem.
+
+
+Mailing lists
+-------------
+
+As some of the above documents describe, the majority of the core kernel
+developers participate on the Linux Kernel Mailing list. Details on how
+to subscribe and unsubscribe from the list can be found at:
+ http://vger.kernel.org/vger-lists.html#linux-kernel
+There are archives of the mailing list on the web in many different
+places. Use a search engine to find these archives. For example:
+ http://dir.gmane.org/gmane.linux.kernel
+It is highly recommended that you search the archives about the topic
+you want to bring up, before you post it to the list. A lot of things
+already discussed in detail are only recorded at the mailing list
+archives.
+
+Most of the individual kernel subsystems also have their own separate
+mailing list where they do their development efforts. See the
+MAINTAINERS file for a list of what these lists are for the different
+groups.
+
+Many of the lists are hosted on kernel.org. Information on them can be
+found at:
+ http://vger.kernel.org/vger-lists.html
+
+Please remember to follow good behavioral habits when using the lists.
+Though a bit cheesy, the following URL has some simple guidelines for
+interacting with the list (or any list):
+ http://www.albion.com/netiquette/
+
+If multiple people respond to your mail, the CC: list of recipients may
+get pretty large. Don't remove anybody from the CC: list without a good
+reason, or don't reply only to the list address. Get used to receiving the
+mail twice, one from the sender and the one from the list, and don't try
+to tune that by adding fancy mail-headers, people will not like it.
+
+Remember to keep the context and the attribution of your replies intact,
+keep the "John Kernelhacker wrote ...:" lines at the top of your reply, and
+add your statements between the individual quoted sections instead of
+writing at the top of the mail.
+
+If you add patches to your mail, make sure they are plain readable text
+as stated in Documentation/SubmittingPatches. Kernel developers don't
+want to deal with attachments or compressed patches; they may want
+to comment on individual lines of your patch, which works only that way.
+Make sure you use a mail program that does not mangle spaces and tab
+characters. A good first test is to send the mail to yourself and try
+to apply your own patch by yourself. If that doesn't work, get your
+mail program fixed or change it until it works.
+
+Above all, please remember to show respect to other subscribers.
+
+
+Working with the community
+--------------------------
+
+The goal of the kernel community is to provide the best possible kernel
+there is. When you submit a patch for acceptance, it will be reviewed
+on its technical merits and those alone. So, what should you be
+expecting?
+ - criticism
+ - comments
+ - requests for change
+ - requests for justification
+ - silence
+
+Remember, this is part of getting your patch into the kernel. You have
+to be able to take criticism and comments about your patches, evaluate
+them at a technical level and either rework your patches or provide
+clear and concise reasoning as to why those changes should not be made.
+If there are no responses to your posting, wait a few days and try
+again, sometimes things get lost in the huge volume.
+
+What should you not do?
+ - expect your patch to be accepted without question
+ - become defensive
+ - ignore comments
+ - resubmit the patch without making any of the requested changes
+
+In a community that is looking for the best technical solution possible,
+there will always be differing opinions on how beneficial a patch is.
+You have to be cooperative, and willing to adapt your idea to fit within
+the kernel. Or at least be willing to prove your idea is worth it.
+Remember, being wrong is acceptable as long as you are willing to work
+toward a solution that is right.
+
+It is normal that the answers to your first patch might simply be a list
+of a dozen things you should correct. This does _not_ imply that your
+patch will not be accepted, and it is _not_ meant against you
+personally. Simply correct all issues raised against your patch and
+resend it.
+
+
+Differences between the kernel community and corporate structures
+-----------------------------------------------------------------
+
+The kernel community works differently than most traditional corporate
+development environments. Here are a list of things that you can try to
+do to try to avoid problems:
+ Good things to say regarding your proposed changes:
+ - "This solves multiple problems."
+ - "This deletes 2000 lines of code."
+ - "Here is a patch that explains what I am trying to describe."
+ - "I tested it on 5 different architectures..."
+ - "Here is a series of small patches that..."
+ - "This increases performance on typical machines..."
+
+ Bad things you should avoid saying:
+ - "We did it this way in AIX/ptx/Solaris, so therefore it must be
+ good..."
+ - "I've being doing this for 20 years, so..."
+ - "This is required for my company to make money"
+ - "This is for our Enterprise product line."
+ - "Here is my 1000 page design document that describes my idea"
+ - "I've been working on this for 6 months..."
+ - "Here's a 5000 line patch that..."
+ - "I rewrote all of the current mess, and here it is..."
+ - "I have a deadline, and this patch needs to be applied now."
+
+Another way the kernel community is different than most traditional
+software engineering work environments is the faceless nature of
+interaction. One benefit of using email and irc as the primary forms of
+communication is the lack of discrimination based on gender or race.
+The Linux kernel work environment is accepting of women and minorities
+because all you are is an email address. The international aspect also
+helps to level the playing field because you can't guess gender based on
+a person's name. A man may be named Andrea and a woman may be named Pat.
+Most women who have worked in the Linux kernel and have expressed an
+opinion have had positive experiences.
+
+The language barrier can cause problems for some people who are not
+comfortable with English. A good grasp of the language can be needed in
+order to get ideas across properly on mailing lists, so it is
+recommended that you check your emails to make sure they make sense in
+English before sending them.
+
+
+Break up your changes
+---------------------
+
+The Linux kernel community does not gladly accept large chunks of code
+dropped on it all at once. The changes need to be properly introduced,
+discussed, and broken up into tiny, individual portions. This is almost
+the exact opposite of what companies are used to doing. Your proposal
+should also be introduced very early in the development process, so that
+you can receive feedback on what you are doing. It also lets the
+community feel that you are working with them, and not simply using them
+as a dumping ground for your feature. However, don't send 50 emails at
+one time to a mailing list, your patch series should be smaller than
+that almost all of the time.
+
+The reasons for breaking things up are the following:
+
+1) Small patches increase the likelihood that your patches will be
+ applied, since they don't take much time or effort to verify for
+ correctness. A 5 line patch can be applied by a maintainer with
+ barely a second glance. However, a 500 line patch may take hours to
+ review for correctness (the time it takes is exponentially
+ proportional to the size of the patch, or something).
+
+ Small patches also make it very easy to debug when something goes
+ wrong. It's much easier to back out patches one by one than it is
+ to dissect a very large patch after it's been applied (and broken
+ something).
+
+2) It's important not only to send small patches, but also to rewrite
+ and simplify (or simply re-order) patches before submitting them.
+
+Here is an analogy from kernel developer Al Viro:
+ "Think of a teacher grading homework from a math student. The
+ teacher does not want to see the student's trials and errors
+ before they came up with the solution. They want to see the
+ cleanest, most elegant answer. A good student knows this, and
+ would never submit her intermediate work before the final
+ solution."
+
+ The same is true of kernel development. The maintainers and
+ reviewers do not want to see the thought process behind the
+ solution to the problem one is solving. They want to see a
+ simple and elegant solution."
+
+It may be challenging to keep the balance between presenting an elegant
+solution and working together with the community and discussing your
+unfinished work. Therefore it is good to get early in the process to
+get feedback to improve your work, but also keep your changes in small
+chunks that they may get already accepted, even when your whole task is
+not ready for inclusion now.
+
+Also realize that it is not acceptable to send patches for inclusion
+that are unfinished and will be "fixed up later."
+
+
+Justify your change
+-------------------
+
+Along with breaking up your patches, it is very important for you to let
+the Linux community know why they should add this change. New features
+must be justified as being needed and useful.
+
+
+Document your change
+--------------------
+
+When sending in your patches, pay special attention to what you say in
+the text in your email. This information will become the ChangeLog
+information for the patch, and will be preserved for everyone to see for
+all time. It should describe the patch completely, containing:
+ - why the change is necessary
+ - the overall design approach in the patch
+ - implementation details
+ - testing results
+
+For more details on what this should all look like, please see the
+ChangeLog section of the document:
+ "The Perfect Patch"
+ http://www.zip.com.au/~akpm/linux/patches/stuff/tpp.txt
+
+
+
+
+All of these things are sometimes very hard to do. It can take years to
+perfect these practices (if at all). It's a continuous process of
+improvement that requires a lot of patience and determination. But
+don't give up, it's possible. Many have done it before, and each had to
+start exactly where you are now.
+
+
+
+
+----------
+Thanks to Paolo Ciarrocchi who allowed the "Development Process" section
+to be based on text he had written, and to Randy Dunlap and Gerrit
+Huizenga for some of the list of things you should and should not say.
+Also thanks to Pat Mochel, Hanna Linder, Randy Dunlap, Kay Sievers,
+Vojtech Pavlik, Jan Kara, Josh Boyer, Kees Cook, Andrew Morton, Andi
+Kleen, Vadim Lobanov, Jesper Juhl, Adrian Bunk, Keri Harris, Frans Pop,
+David A. Wheeler, Junio Hamano, Michael Kerrisk, and Alex Shepard for
+their review, comments, and contributions. Without their help, this
+document would not have been possible.
+
+
+
+Maintainer: Greg Kroah-Hartman <greg@kroah.com>
diff --git a/Documentation/MSI-HOWTO.txt b/Documentation/MSI-HOWTO.txt
index 63edc5f..3ec6c72 100644
--- a/Documentation/MSI-HOWTO.txt
+++ b/Documentation/MSI-HOWTO.txt
@@ -10,14 +10,22 @@
This guide describes the basics of Message Signaled Interrupts (MSI),
the advantages of using MSI over traditional interrupt mechanisms,
and how to enable your driver to use MSI or MSI-X. Also included is
-a Frequently Asked Questions.
+a Frequently Asked Questions (FAQ) section.
+
+1.1 Terminology
+
+PCI devices can be single-function or multi-function. In either case,
+when this text talks about enabling or disabling MSI on a "device
+function," it is referring to one specific PCI device and function and
+not to all functions on a PCI device (unless the PCI device has only
+one function).
2. Copyright 2003 Intel Corporation
3. What is MSI/MSI-X?
Message Signaled Interrupt (MSI), as described in the PCI Local Bus
-Specification Revision 2.3 or latest, is an optional feature, and a
+Specification Revision 2.3 or later, is an optional feature, and a
required feature for PCI Express devices. MSI enables a device function
to request service by sending an Inbound Memory Write on its PCI bus to
the FSB as a Message Signal Interrupt transaction. Because MSI is
@@ -27,7 +35,7 @@ supported.
A PCI device that supports MSI must also support pin IRQ assertion
interrupt mechanism to provide backward compatibility for systems that
-do not support MSI. In Systems, which support MSI, the bus driver is
+do not support MSI. In systems which support MSI, the bus driver is
responsible for initializing the message address and message data of
the device function's MSI/MSI-X capability structure during device
initial configuration.
@@ -61,17 +69,17 @@ over the MSI capability structure as des
- MSI and MSI-X both support per-vector masking. Per-vector
masking is an optional extension of MSI but a required
- feature for MSI-X. Per-vector masking provides the kernel
- the ability to mask/unmask MSI when servicing its software
- interrupt service routing handler. If per-vector masking is
+ feature for MSI-X. Per-vector masking provides the kernel the
+ ability to mask/unmask a single MSI while running its
+ interrupt service routine. If per-vector masking is
not supported, then the device driver should provide the
hardware/software synchronization to ensure that the device
generates MSI when the driver wants it to do so.
4. Why use MSI?
-As a benefit the simplification of board design, MSI allows board
-designers to remove out of band interrupt routing. MSI is another
+As a benefit to the simplification of board design, MSI allows board
+designers to remove out-of-band interrupt routing. MSI is another
step towards a legacy-free environment.
Due to increasing pressure on chipset and processor packages to
@@ -87,7 +95,7 @@ support. As a result, the PCI Express te
support for better interrupt performance.
Using MSI enables the device functions to support two or more
-vectors, which can be configured to target different CPU's to
+vectors, which can be configured to target different CPUs to
increase scalability.
5. Configuring a driver to use MSI/MSI-X
@@ -119,13 +127,13 @@ pci_enable_msi() explicitly.
int pci_enable_msi(struct pci_dev *dev)
-With this new API, any existing device driver, which like to have
-MSI enabled on its device function, must call this API to enable MSI
+With this new API, a device driver that wants to have MSI
+enabled on its device function must call this API to enable MSI.
A successful call will initialize the MSI capability structure
with ONE vector, regardless of whether a device function is
capable of supporting multiple messages. This vector replaces the
-pre-assigned dev->irq with a new MSI vector. To avoid the conflict
-of new assigned vector with existing pre-assigned vector requires
+pre-assigned dev->irq with a new MSI vector. To avoid a conflict
+of the new assigned vector with existing pre-assigned vector requires
a device driver to call this API before calling request_irq().
5.2.2 API pci_disable_msi
@@ -137,14 +145,14 @@ when a device driver is unloading. This
the pre-assigned IOAPIC vector and switches a device's interrupt
mode to PCI pin-irq assertion/INTx emulation mode.
-Note that a device driver should always call free_irq() on MSI vector
-it has done request_irq() on before calling this API. Failure to do
-so results a BUG_ON() and a device will be left with MSI enabled and
+Note that a device driver should always call free_irq() on the MSI vector
+that it has done request_irq() on before calling this API. Failure to do
+so results in a BUG_ON() and a device will be left with MSI enabled and
leaks its vector.
5.2.3 MSI mode vs. legacy mode diagram
-The below diagram shows the events, which switches the interrupt
+The below diagram shows the events which switch the interrupt
mode on the MSI-capable device function between MSI mode and
PIN-IRQ assertion mode.
@@ -155,9 +163,9 @@ PIN-IRQ assertion mode.
------------ pci_disable_msi ------------------------
-Figure 1.0 MSI Mode vs. Legacy Mode
+Figure 1. MSI Mode vs. Legacy Mode
-In Figure 1.0, a device operates by default in legacy mode. Legacy
+In Figure 1, a device operates by default in legacy mode. Legacy
in this context means PCI pin-irq assertion or PCI-Express INTx
emulation. A successful MSI request (using pci_enable_msi()) switches
a device's interrupt mode to MSI mode. A pre-assigned IOAPIC vector
@@ -166,11 +174,11 @@ assigned MSI vector will replace dev->ir
To return back to its default mode, a device driver should always call
pci_disable_msi() to undo the effect of pci_enable_msi(). Note that a
-device driver should always call free_irq() on MSI vector it has done
-request_irq() on before calling pci_disable_msi(). Failure to do so
-results a BUG_ON() and a device will be left with MSI enabled and
+device driver should always call free_irq() on the MSI vector it has
+done request_irq() on before calling pci_disable_msi(). Failure to do
+so results in a BUG_ON() and a device will be left with MSI enabled and
leaks its vector. Otherwise, the PCI subsystem restores a device's
-dev->irq with a pre-assigned IOAPIC vector and marks released
+dev->irq with a pre-assigned IOAPIC vector and marks the released
MSI vector as unused.
Once being marked as unused, there is no guarantee that the PCI
@@ -178,8 +186,8 @@ subsystem will reserve this MSI vector f
the availability of current PCI vector resources and the number of
MSI/MSI-X requests from other drivers, this MSI may be re-assigned.
-For the case where the PCI subsystem re-assigned this MSI vector
-another driver, a request to switching back to MSI mode may result
+For the case where the PCI subsystem re-assigns this MSI vector to
+another driver, a request to switch back to MSI mode may result
in being assigned a different MSI vector or a failure if no more
vectors are available.
@@ -208,12 +216,12 @@ Unlike the function pci_enable_msi(), th
does not replace the pre-assigned IOAPIC dev->irq with a new MSI
vector because the PCI subsystem writes the 1:1 vector-to-entry mapping
into the field vector of each element contained in a second argument.
-Note that the pre-assigned IO-APIC dev->irq is valid only if the device
-operates in PIN-IRQ assertion mode. In MSI-X mode, any attempt of
+Note that the pre-assigned IOAPIC dev->irq is valid only if the device
+operates in PIN-IRQ assertion mode. In MSI-X mode, any attempt at
using dev->irq by the device driver to request for interrupt service
may result unpredictabe behavior.
-For each MSI-X vector granted, a device driver is responsible to call
+For each MSI-X vector granted, a device driver is responsible for calling
other functions like request_irq(), enable_irq(), etc. to enable
this vector with its corresponding interrupt service handler. It is
a device driver's choice to assign all vectors with the same
@@ -224,13 +232,13 @@ service handler.
The PCI 3.0 specification has implementation notes that MMIO address
space for a device's MSI-X structure should be isolated so that the
-software system can set different page for controlling accesses to
-the MSI-X structure. The implementation of MSI patch requires the PCI
+software system can set different pages for controlling accesses to the
+MSI-X structure. The implementation of MSI support requires the PCI
subsystem, not a device driver, to maintain full control of the MSI-X
-table/MSI-X PBA and MMIO address space of the MSI-X table/MSI-X PBA.
-A device driver is prohibited from requesting the MMIO address space
-of the MSI-X table/MSI-X PBA. Otherwise, the PCI subsystem will fail
-enabling MSI-X on its hardware device when it calls the function
+table/MSI-X PBA (Pending Bit Array) and MMIO address space of the MSI-X
+table/MSI-X PBA. A device driver is prohibited from requesting the MMIO
+address space of the MSI-X table/MSI-X PBA. Otherwise, the PCI subsystem
+will fail enabling MSI-X on its hardware device when it calls the function
pci_enable_msix().
5.3.2 Handling MSI-X allocation
@@ -274,9 +282,9 @@ For the case where fewer MSI-X vectors a
than requested, the function pci_enable_msix() will return the
maximum number of MSI-X vectors available to the caller. A device
driver may re-send its request with fewer or equal vectors indicated
-in a return. For example, if a device driver requests 5 vectors, but
-the number of available vectors is 3 vectors, a value of 3 will be a
-return as a result of pci_enable_msix() call. A function could be
+in the return. For example, if a device driver requests 5 vectors, but
+the number of available vectors is 3 vectors, a value of 3 will be
+returned as a result of pci_enable_msix() call. A function could be
designed for its driver to use only 3 MSI-X table entries as
different combinations as ABC--, A-B-C, A--CB, etc. Note that this
patch does not support multiple entries with the same vector. Such
@@ -285,49 +293,46 @@ as ABBCC, AABCC, BCCBA, etc will result
pci_enable_msix(). Below are the reasons why supporting multiple
entries with the same vector is an undesirable solution.
- - The PCI subsystem can not determine which entry, which
- generated the message, to mask/unmask MSI while handling
+ - The PCI subsystem cannot determine the entry that
+ generated the message to mask/unmask MSI while handling
software driver ISR. Attempting to walk through all MSI-X
table entries (2048 max) to mask/unmask any match vector
is an undesirable solution.
- - Walk through all MSI-X table entries (2048 max) to handle
+ - Walking through all MSI-X table entries (2048 max) to handle
SMP affinity of any match vector is an undesirable solution.
5.3.4 API pci_enable_msix
-int pci_enable_msix(struct pci_dev *dev, u32 *entries, int nvec)
+int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
This API enables a device driver to request the PCI subsystem
-for enabling MSI-X messages on its hardware device. Depending on
+to enable MSI-X messages on its hardware device. Depending on
the availability of PCI vectors resources, the PCI subsystem enables
-either all or nothing.
+either all or none of the requested vectors.
-Argument dev points to the device (pci_dev) structure.
+Argument 'dev' points to the device (pci_dev) structure.
-Argument entries is a pointer of unsigned integer type. The number of
-elements is indicated in argument nvec. The content of each element
-will be mapped to the following struct defined in /driver/pci/msi.h.
+Argument 'entries' is a pointer to an array of msix_entry structs.
+The number of entries is indicated in argument 'nvec'.
+struct msix_entry is defined in /driver/pci/msi.h:
struct msix_entry {
u16 vector; /* kernel uses to write alloc vector */
u16 entry; /* driver uses to specify entry */
};
-A device driver is responsible for initializing the field entry of
-each element with unique entry supported by MSI-X table. Otherwise,
+A device driver is responsible for initializing the field 'entry' of
+each element with a unique entry supported by MSI-X table. Otherwise,
-EINVAL will be returned as a result. A successful return of zero
-indicates the PCI subsystem completes initializing each of requested
+indicates the PCI subsystem completed initializing each of the requested
entries of the MSI-X table with message address and message data.
Last but not least, the PCI subsystem will write the 1:1
-vector-to-entry mapping into the field vector of each element. A
-device driver is responsible of keeping track of allocated MSI-X
+vector-to-entry mapping into the field 'vector' of each element. A
+device driver is responsible for keeping track of allocated MSI-X
vectors in its internal data structure.
-Argument nvec is an integer indicating the number of messages
-requested.
-
-A return of zero indicates that the number of MSI-X vectors is
+A return of zero indicates that the number of MSI-X vectors was
successfully allocated. A return of greater than zero indicates
MSI-X vector shortage. Or a return of less than zero indicates
a failure. This failure may be a result of duplicate entries
@@ -341,12 +346,12 @@ void pci_disable_msix(struct pci_dev *de
This API should always be used to undo the effect of pci_enable_msix()
when a device driver is unloading. Note that a device driver should
always call free_irq() on all MSI-X vectors it has done request_irq()
-on before calling this API. Failure to do so results a BUG_ON() and
+on before calling this API. Failure to do so results in a BUG_ON() and
a device will be left with MSI-X enabled and leaks its vectors.
5.3.6 MSI-X mode vs. legacy mode diagram
-The below diagram shows the events, which switches the interrupt
+The below diagram shows the events which switch the interrupt
mode on the MSI-X capable device function between MSI-X mode and
PIN-IRQ assertion mode (legacy).
@@ -356,22 +361,22 @@ PIN-IRQ assertion mode (legacy).
| | ===============> | |
------------ pci_disable_msix ------------------------
-Figure 2.0 MSI-X Mode vs. Legacy Mode
+Figure 2. MSI-X Mode vs. Legacy Mode
-In Figure 2.0, a device operates by default in legacy mode. A
+In Figure 2, a device operates by default in legacy mode. A
successful MSI-X request (using pci_enable_msix()) switches a
device's interrupt mode to MSI-X mode. A pre-assigned IOAPIC vector
stored in dev->irq will be saved by the PCI subsystem; however,
unlike MSI mode, the PCI subsystem will not replace dev->irq with
assigned MSI-X vector because the PCI subsystem already writes the 1:1
-vector-to-entry mapping into the field vector of each element
+vector-to-entry mapping into the field 'vector' of each element
specified in second argument.
To return back to its default mode, a device driver should always call
pci_disable_msix() to undo the effect of pci_enable_msix(). Note that
a device driver should always call free_irq() on all MSI-X vectors it
has done request_irq() on before calling pci_disable_msix(). Failure
-to do so results a BUG_ON() and a device will be left with MSI-X
+to do so results in a BUG_ON() and a device will be left with MSI-X
enabled and leaks its vectors. Otherwise, the PCI subsystem switches a
device function's interrupt mode from MSI-X mode to legacy mode and
marks all allocated MSI-X vectors as unused.
@@ -383,53 +388,56 @@ MSI/MSI-X requests from other drivers, t
re-assigned.
For the case where the PCI subsystem re-assigned these MSI-X vectors
-to other driver, a request to switching back to MSI-X mode may result
+to other drivers, a request to switch back to MSI-X mode may result
being assigned with another set of MSI-X vectors or a failure if no
more vectors are available.
-5.4 Handling function implementng both MSI and MSI-X capabilities
+5.4 Handling function implementing both MSI and MSI-X capabilities
For the case where a function implements both MSI and MSI-X
capabilities, the PCI subsystem enables a device to run either in MSI
mode or MSI-X mode but not both. A device driver determines whether it
wants MSI or MSI-X enabled on its hardware device. Once a device
-driver requests for MSI, for example, it is prohibited to request for
+driver requests for MSI, for example, it is prohibited from requesting
MSI-X; in other words, a device driver is not permitted to ping-pong
between MSI mod MSI-X mode during a run-time.
5.5 Hardware requirements for MSI/MSI-X support
+
MSI/MSI-X support requires support from both system hardware and
individual hardware device functions.
5.5.1 System hardware support
+
Since the target of MSI address is the local APIC CPU, enabling
-MSI/MSI-X support in Linux kernel is dependent on whether existing
-system hardware supports local APIC. Users should verify their
-system whether it runs when CONFIG_X86_LOCAL_APIC=y.
+MSI/MSI-X support in the Linux kernel is dependent on whether existing
+system hardware supports local APIC. Users should verify that their
+system supports local APIC operation by testing that it runs when
+CONFIG_X86_LOCAL_APIC=y.
In SMP environment, CONFIG_X86_LOCAL_APIC is automatically set;
however, in UP environment, users must manually set
CONFIG_X86_LOCAL_APIC. Once CONFIG_X86_LOCAL_APIC=y, setting
-CONFIG_PCI_MSI enables the VECTOR based scheme and
-the option for MSI-capable device drivers to selectively enable
-MSI/MSI-X.
+CONFIG_PCI_MSI enables the VECTOR based scheme and the option for
+MSI-capable device drivers to selectively enable MSI/MSI-X.
Note that CONFIG_X86_IO_APIC setting is irrelevant because MSI/MSI-X
vector is allocated new during runtime and MSI/MSI-X support does not
depend on BIOS support. This key independency enables MSI/MSI-X
-support on future IOxAPIC free platform.
+support on future IOxAPIC free platforms.
5.5.2 Device hardware support
+
The hardware device function supports MSI by indicating the
MSI/MSI-X capability structure on its PCI capability list. By
default, this capability structure will not be initialized by
the kernel to enable MSI during the system boot. In other words,
the device function is running on its default pin assertion mode.
Note that in many cases the hardware supporting MSI have bugs,
-which may result in system hang. The software driver of specific
-MSI-capable hardware is responsible for whether calling
+which may result in system hangs. The software driver of specific
+MSI-capable hardware is responsible for deciding whether to call
pci_enable_msi or not. A return of zero indicates the kernel
-successfully initializes the MSI/MSI-X capability structure of the
+successfully initialized the MSI/MSI-X capability structure of the
device function. The device function is now running on MSI/MSI-X mode.
5.6 How to tell whether MSI/MSI-X is enabled on device function
@@ -439,10 +447,10 @@ pci_enable_msi()/pci_enable_msix() indic
its device function is initialized successfully and ready to run in
MSI/MSI-X mode.
-At the user level, users can use command 'cat /proc/interrupts'
-to display the vector allocated for a device and its interrupt
-MSI/MSI-X mode ("PCI MSI"/"PCI MSIX"). Below shows below MSI mode is
-enabled on a SCSI Adaptec 39320D Ultra320.
+At the user level, users can use the command 'cat /proc/interrupts'
+to display the vectors allocated for devices and their interrupt
+MSI/MSI-X modes ("PCI-MSI"/"PCI-MSI-X"). Below shows MSI mode is
+enabled on a SCSI Adaptec 39320D Ultra320 controller.
CPU0 CPU1
0: 324639 0 IO-APIC-edge timer
@@ -453,8 +461,8 @@ enabled on a SCSI Adaptec 39320D Ultra32
15: 1 0 IO-APIC-edge ide1
169: 0 0 IO-APIC-level uhci-hcd
185: 0 0 IO-APIC-level uhci-hcd
-193: 138 10 PCI MSI aic79xx
-201: 30 0 PCI MSI aic79xx
+193: 138 10 PCI-MSI aic79xx
+201: 30 0 PCI-MSI aic79xx
225: 30 0 IO-APIC-level aic7xxx
233: 30 0 IO-APIC-level aic7xxx
NMI: 0 0
@@ -490,8 +498,8 @@ target address set as 0xfeexxxxx, as con
specification 2.3 or latest, then it should work.
Q4. From the driver point of view, if the MSI is lost because
-of the errors occur during inbound memory write, then it may
-wait for ever. Is there a mechanism for it to recover?
+of errors occurring during inbound memory write, then it may
+wait forever. Is there a mechanism for it to recover?
A4. Since the target of the transaction is an inbound memory
write, all transaction termination conditions (Retry,
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
new file mode 100644
index 0000000..e4c3815
--- /dev/null
+++ b/Documentation/RCU/torture.txt
@@ -0,0 +1,122 @@
+RCU Torture Test Operation
+
+
+CONFIG_RCU_TORTURE_TEST
+
+The CONFIG_RCU_TORTURE_TEST config option is available for all RCU
+implementations. It creates an rcutorture kernel module that can
+be loaded to run a torture test. The test periodically outputs
+status messages via printk(), which can be examined via the dmesg
+command (perhaps grepping for "rcutorture"). The test is started
+when the module is loaded, and stops when the module is unloaded.
+
+However, actually setting this config option to "y" results in the system
+running the test immediately upon boot, and ending only when the system
+is taken down. Normally, one will instead want to build the system
+with CONFIG_RCU_TORTURE_TEST=m and to use modprobe and rmmod to control
+the test, perhaps using a script similar to the one shown at the end of
+this document. Note that you will need CONFIG_MODULE_UNLOAD in order
+to be able to end the test.
+
+
+MODULE PARAMETERS
+
+This module has the following parameters:
+
+nreaders This is the number of RCU reading threads supported.
+ The default is twice the number of CPUs. Why twice?
+ To properly exercise RCU implementations with preemptible
+ read-side critical sections.
+
+stat_interval The number of seconds between output of torture
+ statistics (via printk()). Regardless of the interval,
+ statistics are printed when the module is unloaded.
+ Setting the interval to zero causes the statistics to
+ be printed -only- when the module is unloaded, and this
+ is the default.
+
+verbose Enable debug printk()s. Default is disabled.
+
+
+OUTPUT
+
+The statistics output is as follows:
+
+ rcutorture: --- Start of test: nreaders=16 stat_interval=0 verbose=0
+ rcutorture: rtc: 0000000000000000 ver: 1916 tfle: 0 rta: 1916 rtaf: 0 rtf: 1915
+ rcutorture: Reader Pipe: 1466408 9747 0 0 0 0 0 0 0 0 0
+ rcutorture: Reader Batch: 1464477 11678 0 0 0 0 0 0 0 0
+ rcutorture: Free-Block Circulation: 1915 1915 1915 1915 1915 1915 1915 1915 1915 1915 0
+ rcutorture: --- End of test
+
+The command "dmesg | grep rcutorture:" will extract this information on
+most systems. On more esoteric configurations, it may be necessary to
+use other commands to access the output of the printk()s used by
+the RCU torture test. The printk()s use KERN_ALERT, so they should
+be evident. ;-)
+
+The entries are as follows:
+
+o "ggp": The number of counter flips (or batches) since boot.
+
+o "rtc": The hexadecimal address of the structure currently visible
+ to readers.
+
+o "ver": The number of times since boot that the rcutw writer task
+ has changed the structure visible to readers.
+
+o "tfle": If non-zero, indicates that the "torture freelist"
+ containing structure to be placed into the "rtc" area is empty.
+ This condition is important, since it can fool you into thinking
+ that RCU is working when it is not. :-/
+
+o "rta": Number of structures allocated from the torture freelist.
+
+o "rtaf": Number of allocations from the torture freelist that have
+ failed due to the list being empty.
+
+o "rtf": Number of frees into the torture freelist.
+
+o "Reader Pipe": Histogram of "ages" of structures seen by readers.
+ If any entries past the first two are non-zero, RCU is broken.
+ And rcutorture prints the error flag string "!!!" to make sure
+ you notice. The age of a newly allocated structure is zero,
+ it becomes one when removed from reader visibility, and is
+ incremented once per grace period subsequently -- and is freed
+ after passing through (RCU_TORTURE_PIPE_LEN-2) grace periods.
+
+ The output displayed above was taken from a correctly working
+ RCU. If you want to see what it looks like when broken, break
+ it yourself. ;-)
+
+o "Reader Batch": Another histogram of "ages" of structures seen
+ by readers, but in terms of counter flips (or batches) rather
+ than in terms of grace periods. The legal number of non-zero
+ entries is again two. The reason for this separate view is
+ that it is easier to get the third entry to show up in the
+ "Reader Batch" list than in the "Reader Pipe" list.
+
+o "Free-Block Circulation": Shows the number of torture structures
+ that have reached a given point in the pipeline. The first element
+ should closely correspond to the number of structures allocated,
+ the second to the number that have been removed from reader view,
+ and all but the last remaining to the corresponding number of
+ passes through a grace period. The last entry should be zero,
+ as it is only incremented if a torture structure's counter
+ somehow gets incremented farther than it should.
+
+
+USAGE
+
+The following script may be used to torture RCU:
+
+ #!/bin/sh
+
+ modprobe rcutorture
+ sleep 100
+ rmmod rcutorture
+ dmesg | grep rcutorture:
+
+The output can be manually inspected for the error flag of "!!!".
+One could of course create a more elaborate script that automatically
+checked for such errors.
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 354d89c..15da168 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -772,8 +772,6 @@ RCU pointer/list traversal:
list_for_each_entry_rcu
list_for_each_continue_rcu (to be deprecated in favor of new
list_for_each_entry_continue_rcu)
- hlist_for_each_rcu (to be deprecated in favor of
- hlist_for_each_entry_rcu)
hlist_for_each_entry_rcu
RCU pointer update:
diff --git a/Documentation/arm/00-INDEX b/Documentation/arm/00-INDEX
index d753fe5..2c6a3b3 100644
--- a/Documentation/arm/00-INDEX
+++ b/Documentation/arm/00-INDEX
@@ -16,5 +16,7 @@ empeg
- Empeg documentation
mem_alignment
- alignment abort handler documentation
+memory.txt
+ - description of the virtual memory layout
nwfpe
- NWFPE floating point emulator documentation
diff --git a/Documentation/arm/README b/Documentation/arm/README
index a6f718e..5ed6f35 100644
--- a/Documentation/arm/README
+++ b/Documentation/arm/README
@@ -8,10 +8,9 @@ Compilation of kernel
---------------------
In order to compile ARM Linux, you will need a compiler capable of
- generating ARM ELF code with GNU extensions. GCC 2.95.1, EGCS
- 1.1.2, and GCC 3.3 are known to be good compilers. Fortunately, you
- needn't guess. The kernel will report an error if your compiler is
- a recognized offender.
+ generating ARM ELF code with GNU extensions. GCC 3.3 is known to be
+ a good compiler. Fortunately, you needn't guess. The kernel will report
+ an error if your compiler is a recognized offender.
To build ARM Linux natively, you shouldn't have to alter the ARCH = line
in the top level Makefile. However, if you don't have the ARM Linux ELF
diff --git a/Documentation/arm/Samsung-S3C24XX/Overview.txt b/Documentation/arm/Samsung-S3C24XX/Overview.txt
index 3af4d29..89aa89d 100644
--- a/Documentation/arm/Samsung-S3C24XX/Overview.txt
+++ b/Documentation/arm/Samsung-S3C24XX/Overview.txt
@@ -81,7 +81,8 @@ Adding New Machines
Any large scale modifications, or new drivers should be discussed
on the ARM kernel mailing list (linux-arm-kernel) before being
- attempted.
+ attempted. See http://www.arm.linux.org.uk/mailinglists/ for the
+ mailing list information.
NAND
@@ -120,6 +121,43 @@ Clock Management
various clock units
+Platform Data
+-------------
+
+ Whenever a device has platform specific data that is specified
+ on a per-machine basis, care should be taken to ensure the
+ following:
+
+ 1) that default data is not left in the device to confuse the
+ driver if a machine does not set it at startup
+
+ 2) the data should (if possible) be marked as __initdata,
+ to ensure that the data is thrown away if the machine is
+ not the one currently in use.
+
+ The best way of doing this is to make a function that
+ kmalloc()s an area of memory, and copies the __initdata
+ and then sets the relevant device's platform data. Making
+ the function `__init` takes care of ensuring it is discarded
+ with the rest of the initialisation code
+
+ static __init void s3c24xx_xxx_set_platdata(struct xxx_data *pd)
+ {
+ struct s3c2410_xxx_mach_info *npd;
+
+ npd = kmalloc(sizeof(struct s3c2410_xxx_mach_info), GFP_KERNEL);
+ if (npd) {
+ memcpy(npd, pd, sizeof(struct s3c2410_xxx_mach_info));
+ s3c_device_xxx.dev.platform_data = npd;
+ } else {
+ printk(KERN_ERR "no memory for xxx platform data\n");
+ }
+ }
+
+ Note, since the code is marked as __init, it should not be
+ exported outside arch/arm/mach-s3c2410/, or exported to
+ modules via EXPORT_SYMBOL() and related functions.
+
Port Contributors
-----------------
@@ -149,6 +187,7 @@ Document Changes
06 Mar 2005 - BJD - Added Christer Weinigel
08 Mar 2005 - BJD - Added LCVR to list of people, updated introduction
08 Mar 2005 - BJD - Added section on adding machines
+ 09 Sep 2005 - BJD - Added section on platform data
Document Author
---------------
diff --git a/Documentation/arm/VFP/release-notes.txt b/Documentation/arm/VFP/release-notes.txt
index f28e022..28a2795 100644
--- a/Documentation/arm/VFP/release-notes.txt
+++ b/Documentation/arm/VFP/release-notes.txt
@@ -12,7 +12,7 @@ This release has been validated against
John R. Hauser using the TestFloat-2a test suite. Details of this
library and test suite can be found at:
- http://www.cs.berkeley.edu/~jhauser/arithmetic/SoftFloat.html
+ http://www.jhauser.us/arithmetic/SoftFloat.html
The operations which have been tested with this package are:
diff --git a/Documentation/arm/memory.txt b/Documentation/arm/memory.txt
index 4b1c93a..dc60455 100644
--- a/Documentation/arm/memory.txt
+++ b/Documentation/arm/memory.txt
@@ -1,7 +1,7 @@
Kernel Memory Layout on ARM Linux
Russell King <rmk@arm.linux.org.uk>
- May 21, 2004 (2.6.6)
+ November 17, 2005 (2.6.15)
This document describes the virtual memory layout which the Linux
kernel uses for ARM processors. It indicates which regions are
@@ -37,6 +37,8 @@ ff000000 ffbfffff Reserved for future ex
mapping region.
VMALLOC_END feffffff Free for platform use, recommended.
+ VMALLOC_END must be aligned to a 2MB
+ boundary.
VMALLOC_START VMALLOC_END-1 vmalloc() / ioremap() space.
Memory returned by vmalloc/ioremap will
diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt
index 8eedaa2..23a1c24 100644
--- a/Documentation/atomic_ops.txt
+++ b/Documentation/atomic_ops.txt
@@ -115,6 +115,33 @@ boolean is return which indicates whethe
is negative. It requires explicit memory barrier semantics around the
operation.
+Then:
+
+ int atomic_cmpxchg(atomic_t *v, int old, int new);
+
+This performs an atomic compare exchange operation on the atomic value v,
+with the given old and new values. Like all atomic_xxx operations,
+atomic_cmpxchg will only satisfy its atomicity semantics as long as all
+other accesses of *v are performed through atomic_xxx operations.
+
+atomic_cmpxchg requires explicit memory barriers around the operation.
+
+The semantics for atomic_cmpxchg are the same as those defined for 'cas'
+below.
+
+Finally:
+
+ int atomic_add_unless(atomic_t *v, int a, int u);
+
+If the atomic value v is not equal to u, this function adds a to v, and
+returns non zero. If v is equal to u then it returns zero. This is done as
+an atomic operation.
+
+atomic_add_unless requires explicit memory barriers around the operation.
+
+atomic_inc_not_zero, equivalent to atomic_add_unless(v, 1, 0)
+
+
If a caller requires memory barrier semantics around an atomic_t
operation which does not return a value, a set of interfaces are
defined which accomplish this:
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 6dd274d..0fe01c8 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -906,9 +906,20 @@ Aside:
4. The I/O scheduler
-I/O schedulers are now per queue. They should be runtime switchable and modular
-but aren't yet. Jens has most bits to do this, but the sysfs implementation is
-missing.
+I/O scheduler, a.k.a. elevator, is implemented in two layers. Generic dispatch
+queue and specific I/O schedulers. Unless stated otherwise, elevator is used
+to refer to both parts and I/O scheduler to specific I/O schedulers.
+
+Block layer implements generic dispatch queue in ll_rw_blk.c and elevator.c.
+The generic dispatch queue is responsible for properly ordering barrier
+requests, requeueing, handling non-fs requests and all other subtleties.
+
+Specific I/O schedulers are responsible for ordering normal filesystem
+requests. They can also choose to delay certain requests to improve
+throughput or whatever purpose. As the plural form indicates, there are
+multiple I/O schedulers. They can be built as modules but at least one should
+be built inside the kernel. Each queue can choose different one and can also
+change to another one dynamically.
A block layer call to the i/o scheduler follows the convention elv_xxx(). This
calls elevator_xxx_fn in the elevator switch (drivers/block/elevator.c). Oh,
@@ -921,44 +932,36 @@ keeping work.
The functions an elevator may implement are: (* are mandatory)
elevator_merge_fn called to query requests for merge with a bio
-elevator_merge_req_fn " " " with another request
+elevator_merge_req_fn called when two requests get merged. the one
+ which gets merged into the other one will be
+ never seen by I/O scheduler again. IOW, after
+ being merged, the request is gone.
elevator_merged_fn called when a request in the scheduler has been
involved in a merge. It is used in the deadline
scheduler for example, to reposition the request
if its sorting order has changed.
-*elevator_next_req_fn returns the next scheduled request, or NULL
- if there are none (or none are ready).
+elevator_dispatch_fn fills the dispatch queue with ready requests.
+ I/O schedulers are free to postpone requests by
+ not filling the dispatch queue unless @force
+ is non-zero. Once dispatched, I/O schedulers
+ are not allowed to manipulate the requests -
+ they belong to generic dispatch queue.
-*elevator_add_req_fn called to add a new request into the scheduler
+elevator_add_req_fn called to add a new request into the scheduler
elevator_queue_empty_fn returns true if the merge queue is empty.
Drivers shouldn't use this, but rather check
if elv_next_request is NULL (without losing the
request if one exists!)
-elevator_remove_req_fn This is called when a driver claims ownership of
- the target request - it now belongs to the
- driver. It must not be modified or merged.
- Drivers must not lose the request! A subsequent
- call of elevator_next_req_fn must return the
- _next_ request.
-
-elevator_requeue_req_fn called to add a request to the scheduler. This
- is used when the request has alrnadebeen
- returned by elv_next_request, but hasn't
- completed. If this is not implemented then
- elevator_add_req_fn is called instead.
-
elevator_former_req_fn
elevator_latter_req_fn These return the request before or after the
one specified in disk sort order. Used by the
block layer to find merge possibilities.
-elevator_completed_req_fn called when a request is completed. This might
- come about due to being merged with another or
- when the device completes the request.
+elevator_completed_req_fn called when a request is completed.
elevator_may_queue_fn returns true if the scheduler wants to allow the
current context to queue a new request even if
@@ -967,13 +970,33 @@ elevator_may_queue_fn returns true if t
elevator_set_req_fn
elevator_put_req_fn Must be used to allocate and free any elevator
- specific storate for a request.
+ specific storage for a request.
+
+elevator_activate_req_fn Called when device driver first sees a request.
+ I/O schedulers can use this callback to
+ determine when actual execution of a request
+ starts.
+elevator_deactivate_req_fn Called when device driver decides to delay
+ a request by requeueing it.
elevator_init_fn
elevator_exit_fn Allocate and free any elevator specific storage
for a queue.
-4.2 I/O scheduler implementation
+4.2 Request flows seen by I/O schedulers
+All requests seens by I/O schedulers strictly follow one of the following three
+flows.
+
+ set_req_fn ->
+
+ i. add_req_fn -> (merged_fn ->)* -> dispatch_fn -> activate_req_fn ->
+ (deactivate_req_fn -> activate_req_fn ->)* -> completed_req_fn
+ ii. add_req_fn -> (merged_fn ->)* -> merge_req_fn
+ iii. [none]
+
+ -> put_req_fn
+
+4.3 I/O scheduler implementation
The generic i/o scheduler algorithm attempts to sort/merge/batch requests for
optimal disk scan and request servicing performance (based on generic
principles and device capabilities), optimized for:
@@ -993,18 +1016,7 @@ request in sort order to prevent binary
This arrangement is not a generic block layer characteristic however, so
elevators may implement queues as they please.
-ii. Last merge hint
-The last merge hint is part of the generic queue layer. I/O schedulers must do
-some management on it. For the most part, the most important thing is to make
-sure q->last_merge is cleared (set to NULL) when the request on it is no longer
-a candidate for merging (for example if it has been sent to the driver).
-
-The last merge performed is cached as a hint for the subsequent request. If
-sequential data is being submitted, the hint is used to perform merges without
-any scanning. This is not sufficient when there are multiple processes doing
-I/O though, so a "merge hash" is used by some schedulers.
-
-iii. Merge hash
+ii. Merge hash
AS and deadline use a hash table indexed by the last sector of a request. This
enables merging code to quickly look up "back merge" candidates, even when
multiple I/O streams are being performed at once on one disk.
@@ -1013,29 +1025,8 @@ multiple I/O streams are being performed
are far less common than "back merges" due to the nature of most I/O patterns.
Front merges are handled by the binary trees in AS and deadline schedulers.
-iv. Handling barrier cases
-A request with flags REQ_HARDBARRIER or REQ_SOFTBARRIER must not be ordered
-around. That is, they must be processed after all older requests, and before
-any newer ones. This includes merges!
-
-In AS and deadline schedulers, barriers have the effect of flushing the reorder
-queue. The performance cost of this will vary from nothing to a lot depending
-on i/o patterns and device characteristics. Obviously they won't improve
-performance, so their use should be kept to a minimum.
-
-v. Handling insertion position directives
-A request may be inserted with a position directive. The directives are one of
-ELEVATOR_INSERT_BACK, ELEVATOR_INSERT_FRONT, ELEVATOR_INSERT_SORT.
-
-ELEVATOR_INSERT_SORT is a general directive for non-barrier requests.
-ELEVATOR_INSERT_BACK is used to insert a barrier to the back of the queue.
-ELEVATOR_INSERT_FRONT is used to insert a barrier to the front of the queue, and
-overrides the ordering requested by any previous barriers. In practice this is
-harmless and required, because it is used for SCSI requeueing. This does not
-require flushing the reorder queue, so does not impose a performance penalty.
-
-vi. Plugging the queue to batch requests in anticipation of opportunities for
- merge/sort optimizations
+iii. Plugging the queue to batch requests in anticipation of opportunities for
+ merge/sort optimizations
This is just the same as in 2.4 so far, though per-device unplugging
support is anticipated for 2.5. Also with a priority-based i/o scheduler,
@@ -1069,11 +1060,11 @@ Aside:
blk_kick_queue() to unplug a specific queue (right away ?)
or optionally, all queues, is in the plan.
-4.3 I/O contexts
+4.4 I/O contexts
I/O contexts provide a dynamically allocated per process data area. They may
be used in I/O schedulers, and in the block layer (could be used for IO statis,
-