[dev][nand][aml-rawnand] Fix uncorrectable ECC handling.

1) Fix handling of uncorrectable ECC errors. For non-randomized
data, uncorrectable ECC errors result in an error propagated up
to the NAND protocol, which retries the read thrice. For randomized
data, a further check is necessary (the previous check was reversed).
2) Add verbose error logging to the code handling uncorrectable ECC errors.
3) Fix error handling for the case where we time out waiting for a read
interrupt.
4) Populate the ECC strength from the Page0 read (the ECC strength is
used in the handling of ECC uncorrectable errors for the randomized
case - not applicable to us right now, but might be later).

Bug: ZX-2616.

Test: Use Ruchira's code to do SDIO reads at intervals of our choice
to generate bus loading, and use Ricardo's tester for data validation.

Change-Id: I1be58d291f872cb91018aabd3c0673caa109c507
diff --git a/system/dev/nand/aml-rawnand/aml-rawnand.c b/system/dev/nand/aml-rawnand/aml-rawnand.c
index 68592f1..af93c26 100644
--- a/system/dev/nand/aml-rawnand/aml-rawnand.c
+++ b/system/dev/nand/aml-rawnand/aml-rawnand.c
@@ -34,7 +34,7 @@
 static const uint32_t chipsel[2] = {NAND_CE0, NAND_CE1};
 
 struct aml_controller_params aml_params = {
-    8,
+    8, /* Overwritten using BCH setting from page0 */
     2,
     /* The 2 following values are overwritten by page0 contents */
     1,                /* rand-mode is 1 for page0 */
@@ -100,6 +100,36 @@
     return ecc_page;
 }
 
+int aml_get_ecc_strength(uint32_t ecc_mode) {
+    int ecc_strength; /* numeric strength for ecc_mode, or -1 if unrecognized */
+
+    switch (ecc_mode) { /* translate the BCH mode constant to its strength value */
+    case AML_ECC_BCH8:
+    case AML_ECC_BCH8_1K: /* both BCH8 variants map to the same strength */
+        ecc_strength = 8;
+        break;
+    case AML_ECC_BCH24_1K:
+        ecc_strength = 24;
+        break;
+    case AML_ECC_BCH30_1K:
+        ecc_strength = 30;
+        break;
+    case AML_ECC_BCH40_1K:
+        ecc_strength = 40;
+        break;
+    case AML_ECC_BCH50_1K:
+        ecc_strength = 50;
+        break;
+    case AML_ECC_BCH60_1K:
+        ecc_strength = 60;
+        break;
+    default: /* any mode not listed above */
+        ecc_strength = -1; /* negative sentinel: no strength known for this mode */
+        break;
+    }
+    return ecc_strength;
+}
+
 static void aml_cmd_idle(aml_raw_nand_t* raw_nand, uint32_t time) {
     uint32_t cmd = 0;
     volatile uint8_t* reg = (volatile uint8_t*)
@@ -269,7 +299,8 @@
  * Returns the maximum bitflips corrected on this NAND page
  * (the maximum bitflips across all of the ECC pages in this page).
  */
-static int aml_get_ecc_corrections(aml_raw_nand_t* raw_nand, int ecc_pages) {
+static int aml_get_ecc_corrections(aml_raw_nand_t* raw_nand, int ecc_pages,
+                                   uint32_t nand_page) {
     struct aml_info_format* info;
     int bitflips = 0;
     uint8_t zero_cnt;
@@ -277,6 +308,11 @@
     for (int i = 0; i < ecc_pages; i++) {
         info = aml_info_ptr(raw_nand, i);
         if (info->ecc.eccerr_cnt == AML_ECC_UNCORRECTABLE_CNT) {
+            if (!raw_nand->controller_params.rand_mode) {
+                zxlogf(ERROR, "%s: ECC failure (non-randomized)@%u\n", __func__, nand_page);
+                raw_nand->stats.failed++;
+                return ECC_CHECK_RETURN_FF;
+            }
             /*
              * Why are we checking for zero_cnt here ?
              * Per Amlogic HW architect, this is to deal with
@@ -287,15 +323,19 @@
              * blank page.
              */
             zero_cnt = info->zero_cnt & AML_ECC_UNCORRECTABLE_CNT;
-            if (raw_nand->controller_params.rand_mode &&
-                (zero_cnt < raw_nand->controller_params.ecc_strength)) {
-                zxlogf(ERROR, "%s: Returning ECC failure\n",
-                       __func__);
+            if (zero_cnt >= raw_nand->controller_params.ecc_strength) {
+                zxlogf(ERROR, "%s: ECC failure (randomized)@%u zero_cnt=%u\n",
+                       __func__, nand_page, zero_cnt);
+                raw_nand->stats.failed++;
                 return ECC_CHECK_RETURN_FF;
             }
-            raw_nand->stats.failed++;
+            zxlogf(ERROR, "%s: Blank Page@%u\n", __func__, nand_page);
             continue;
         }
+        if (info->ecc.eccerr_cnt != 0) {
+            zxlogf(INFO, "%s: Corrected %u ECC errors@%u\n",
+                   __func__, info->ecc.eccerr_cnt, nand_page);
+        }
         raw_nand->stats.ecc_corrected += info->ecc.eccerr_cnt;
         bitflips = MAX(bitflips, info->ecc.eccerr_cnt);
     }
@@ -506,7 +546,11 @@
                __func__, status);
         return status;
     }
-    aml_queue_rb(raw_nand);
+    status = aml_queue_rb(raw_nand);
+    if (status != ZX_OK) {
+        zxlogf(ERROR, "%s: aml_queue_rb failed %d\n", __func__, status);
+        return ZX_ERR_IO;
+    }
     status = aml_check_ecc_pages(raw_nand, ecc_pages);
     if (status != ZX_OK) {
         zxlogf(ERROR, "%s: aml_check_ecc_pages failed %d\n",
@@ -524,7 +568,7 @@
     }
     if (oob != NULL)
         status = aml_get_oob_byte(raw_nand, oob);
-    ecc_c = aml_get_ecc_corrections(raw_nand, ecc_pages);
+    ecc_c = aml_get_ecc_corrections(raw_nand, ecc_pages, nand_page);
     if (ecc_c < 0) {
         zxlogf(ERROR, "%s: Uncorrectable ECC error on read\n",
                __func__);
@@ -863,6 +907,15 @@
         (page0->nand_setup.cfg.d32 >> 19) & 0x1;
     raw_nand->controller_params.bch_mode =
         (page0->nand_setup.cfg.d32 >> 14) & 0x7;
+
+    raw_nand->controller_params.ecc_strength =
+        aml_get_ecc_strength(raw_nand->controller_params.bch_mode);
+    if (raw_nand->controller_params.ecc_strength < 0) {
+        zxlogf(INFO, "%s: BAD ECC strength computed from BCH Mode\n", __func__);
+        free(data);
+        return ZX_ERR_BAD_STATE;
+    }
+
     zxlogf(INFO, "%s: NAND BCH Mode is %s\n", __func__,
            aml_ecc_string(raw_nand->controller_params.bch_mode));
     free(data);