Differences

This shows you the differences between two versions of the page.

--- soc:2008:mdeck:journal:week7 [2008/07/09 14:18]
mdeck
+++ soc:2008:mdeck:journal:week7 [2008/07/13 12:23] (current)
mdeck
@@ Line 30: / Line 30: @@
 In the meantime, analyzing wireshark output should show any problems with rx & tx during HTTP booting.  I may also play with GDB a bit more to figure out what's going on, but currently I need to nail down the bug to something more specific.
+=== 10 July ===
+This morning I installed wireshark and have been inspecting HTTP boot packet communications.  I found a number of duplicate transmissions (including duplicated TCP sequence numbers).  It seemed something was wrong with the transmission path.
+I added a few debug lines to ''ifec_tx_wake()'':
+<file>
+void ifec_tx_wake ( struct net_device *netdev ) {
+	struct ifec_private *priv   = netdev->priv;
+	unsigned long        ioaddr = priv->ioaddr;
+	struct ifec_active  *a      = priv->active;
+	struct ifec_tcb     *tcb    = a->tcb_head->next;
+	/* For the special case of the first transmit, we issue a START. The
+	 * card won't RESUME after the configure command. */
+	if ( a->configured ) {
+		a->configured = 0;
+		ifec_scb_cmd ( netdev, virt_to_bus ( tcb ), CUStart );
+		ifec_scb_cmd_wait ( netdev );
+		return;
+	}
+/* if not suspended, and all other tcbs have suspend flag clear, do NOT clear
+ * the suspend flag.  if you do, it will enter a bad state.  we need a tcb with
+ * a suspend flag set in the tx ring at all times. */
+	/* Resume if suspended. */
+	switch ( ( inw ( ioaddr + SCBStatus ) >> 6 ) & 0x3 ) {
+	case 0:  /* Idle - We should not reach this state. */
+		DBG ( "ifec_net_transmit: tx idle!\n" );
+		ifec_scb_cmd ( netdev, virt_to_bus ( tcb ), CUStart );
+		ifec_scb_cmd_wait ( netdev );
+		break;
+	case 1:  /* Suspended */
+		DBG ( "s" ); //ifec_net_transmit: tx suspended : resume issued\n" );
+		ifec_scb_cmd_wait ( netdev );
+		outl ( 0, ioaddr + SCBPointer );
+		a->tcb_head->command &= ~CmdSuspend;
+		/* Immediately issue Resume command */
+		outb ( CUResume, ioaddr + SCBCmd );
+		ifec_scb_cmd_wait ( netdev );
+		break;
+	default:
+		DBG ( "a" );
+		a->tcb_head->command &= ~CmdSuspend;
+	}
+}
+</file>
+This way I could see what state the Command Unit was in prior to each tx.  Comparing this debug output with the wireshark output, I found that every instance of an 'a' coincided with a duplicate packet transmission.
+Now, the same packet being transmitted twice is odd.  The driver is set up to write into the next TCB in the tx ring for each transmit call.  I added a debug line in ''ifec_net_transmit()'':
+<file>
+static int ifec_net_transmit ( struct net_device *netdev,
+                               struct io_buffer *iobuf ) {
+	struct ifec_private *priv   = netdev->priv;
+	unsigned long        ioaddr = priv->ioaddr;
+	struct ifec_active  *a      = priv->active;
+	struct ifec_tcb     *tcb    = a->tcb_head->next;
+	unsigned short status;
+	/* Wait for TCB to become available. */
+	if ( tcb->status || tcb->iob ) {
+		DBGP ( "TX overflow\n" );
+		return -ENOBUFS;
+	}
+	status = inw ( ioaddr + SCBStatus );
+	/* Acknowledge all of the current interrupt sources ASAP. */
+	outw ( status & 0xfc00, ioaddr + SCBStatus );
+	DBGIO ( "transmitting packet (%d bytes). status = %hX, cmd=%hX\n",
+		iob_len ( iobuf ), status, inw ( ioaddr + SCBCmd ) );
+	DBGIO_HD ( iobuf->data, iob_len ( iobuf ) );
+	tcb->command   = CmdSuspend | CmdTx | CmdTxFlex;
+	tcb->count     = 0x01208000;
+	tcb->tbd_addr0 = virt_to_bus ( iobuf->data );
+	tcb->tbd_size0 = 0x3FFF & iob_len ( iobuf );
+	tcb->iob = iobuf;
+	DBG ( "%i", tcb - a->tcbs );
+	DBGIO ( "tcb: \n" );
+	DBGIO_HD ( tcb, sizeof ( *tcb ) );
+	ifec_tx_wake ( netdev );
+	/* Append to end of ring. */
+	a->tcb_head = tcb;
+	return 0;
+}
+</file>
+The line ''DBG ( "%i", tcb - a->tcbs );'' prints out the index of the current TCB in the tx ring.  The debug output showed proper circulation from 0 through 3 and back to 0 repeatedly.  However, it also showed no duplicates in wireshark!
+From this behavior, I made the assumption that the time delay of printing the debug output at that point prevents the 'a' condition from ever occurring.  This, in turn, prevents the duplication bug.  The 'a' condition is the CU being in the active state, which happens when a transmit request arrives before the card has finished processing the previous tx.
+Thus, I now have nailed down at least //one// bug, and now I can determine what's going wrong.
+  * [[http://git.etherboot.org/?p=people/mdeck/gpxe.git;a=commit;h=9f561a19282078cc0346487d2a2b34060e1a3f62|[Drivers-eepro100] Bug fixes]]
+The end of ''ifec_tx_wake()'' performs different operations depending on whether the CU is active or suspended.  After some consideration, it seems that even if the CU is active, a RESUME should still be issued - this will cause the CU to re-read the current TCB's S-bit.  Thus, after clearing that bit, the CU will continue on and process this newly appended transmit command.
+Otherwise, if the card was active before the tx, it would suspend before processing the new TCB.  This means the card is suspended at a TCB prior to the ''tcb_head''.  This could happen multiple times, moving the TCB at which the card is actually suspended closer to ''tcb_tail''.  I think eventually the tail would surpass the suspended TCB, and the head may write into the next TCB, which is transmitted at the next ''ifec_net_transmit()''.  This is speculation, as there may be some other way this corruption was occurring.
+The bottom of ''ifec_tx_wake()'' was changed as such:
+<file>
+	/* Resume if suspended. */
+	switch ( ( inw ( ioaddr + SCBStatus ) >> 6 ) & 0x3 ) {
+	case 0:  /* Idle - We should not reach this state. */
+		DBG ( "\nifec_net_transmit: tx idle!\n" );
+		ifec_scb_cmd ( netdev, virt_to_bus ( tcb ), CUStart );
+		ifec_scb_cmd_wait ( netdev );
+		return;
+	case 1:  /* Suspended */
+		DBG ( "s" );
+		break;
+	default: /* Active */
+		DBG ( "a" );
+	}
+	ifec_scb_cmd_wait ( netdev );
+	outl ( 0, ioaddr + SCBPointer );
+	a->tcb_head->command &= ~CmdSuspend;
+	/* Immediately issue Resume command */
+	outb ( CUResume, ioaddr + SCBCmd );
+	ifec_scb_cmd_wait ( netdev );
+}
+</file>
+As you can see, the RESUME is issued even if the card is active.
+Additionally, I removed a line from ''ifec_tx_process()'':
+<file>
+static void ifec_tx_process ( struct net_device *netdev ) {
+	struct ifec_private *priv   = netdev->priv;
+	struct ifec_tcb     *tcb    = priv->active->tcb_tail;
+	s16           status;
+	/* Check status of transmitted packets */
+	while ( ( status = tcb->status ) && tcb->iob ) {
+		if ( status & TCB_U ) {
+			DBG ( "ifec_tx_process : tx error!\n " );
+			netdev_tx_complete_err ( netdev, tcb->iob, -ENOMEM );
+		} else {
+			netdev_tx_complete ( netdev, tcb->iob );
+		}
+		DBGIO ( "tx completion\n" );
+		tcb->iob = NULL;
+		tcb->status = 0;
+//		tcb->command &= ~CmdSuspend;	/* Allow controller to resume. */
+		priv->active->tcb_tail = tcb->next;	/* Next TCB */
+		tcb = tcb->next;
+	}
+}
+</file>
+This ensures the suspend bit isn't cleared except in the ''ifec_tx_wake()'' routine.  This line was redundant.
+=== 13 July ===
+Lacking iSCSI packet captures to look at, I decided to try booting over AoE.  This involves enough driver activity that I hope it will expose a bug.
+Booting a Windows image over AoE got stuck at the Windows splash screen.  I then tried booting this image using Safe Mode.  Every .sys driver loads until it gets to aoe32.sys.  The system freezes at this line.  I don't know enough about the AoE driver to determine what could be causing this.
+I then compiled and attempted the same AoE boot using the legacy eepro100 driver.  The boot sequence was exactly the same, with the machine freezing while loading aoe32.sys.  I'll need to get a working AoE image to test this properly.

Trace:

Differences

Navigation

Search

Toolbox

QR Code