5.2 Import Checklist

คุณได้ CSV จากทีมอื่น เปิดใน Sheets ดูเหมือนปกติ import เข้า pandas ก็โอเค load เข้า SQL ก็ไม่ error — แต่ 2 สัปดาห์ต่อมา ตัวเลขไม่ตรง เพราะไม่มีใครตรวจก่อน import

เวลาที่ใช้: ~12 นาที

Import Checklist

ทุกครั้งที่ได้ CSV ใหม่ ต้องตรวจ 5 สิ่ง ก่อนใช้งาน:

Check 1: Row Count ตรงไหม?

ไฟล์ต้นทางมีกี่แถว? ไฟล์ที่ import เข้ามามีกี่แถว? ถ้าไม่เท่ากัน — แถวหายระหว่างทาง

Check 2: Nulls ต่อ Column ตรงตาม Contract ไหม?

Column ที่กำหนดว่า NOT NULL มีค่าว่างอยู่ไหม? Column ที่ nullable มี null กี่แถว — สมเหตุสมผลไหม?

Check 3: IDs มี Leading Zeros ครบไหม?

student_id = "001234" กลายเป็น 1234 หรือเปล่า? ถ้า contract บอกว่า 6 หลัก ต้องมี 6 หลักจริง

Check 4: Dates ถูก Parse เป็น Date จริงไหม?

enrolled_on เป็น date object จริงหรือยังเป็น string? "2025-01-15" ต้องไม่ใช่แค่ text

Check 5: Totals ตรงกับ Source ไหม?

SUM(price) ในไฟล์ import ต้องตรงกับ SUM(price) จากต้นทาง ถ้าต่างกัน → อาจมี rounding errors จาก float

ลองตรวจจริง

สมมติ import CSV เข้ามาใน Sheet1 แล้ว:

// Check 1: Row count
=COUNTA(A2:A1000)
// Compare with the row count reported by the source

// Check 2: Nulls per column
// NOTE: COUNTBLANK also counts empty rows below the data, so the range
// must cover exactly the imported rows.
=COUNTBLANK(C2:C1000)         // phone column — does it match what you expect?
=COUNTBLANK(D2:D1000)         // price column — must be 0!

// Check 3: Leading zeros
=LEN(A2)                      // student_id must give 6
=SUMPRODUCT((A2:A1000<>"")*(LEN(A2:A1000)<>6)) // non-empty IDs whose length is not 6 (empty rows are ignored)

// Check 4: Dates
=ISNUMBER(E2)                 // TRUE = date, FALSE = text
=COUNTA(E2:E1000)-COUNT(E2:E1000) // non-empty cells that are still text (COUNT only counts numeric/date cells)

// Check 5: Totals
=SUM(D2:D1000)                // compare with the source total

ใน C คุณต้องเขียน validation function เอง:

#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include <stdlib.h>

// One student record, as validated by the functions below.
// Text fields are fixed-size char arrays; each size leaves one byte for
// the terminating NUL.
typedef struct {
    char student_id[7];   // room for 6 characters + NUL (validate_id requires exactly 6 digits)
    char name[101];       // room for 100 bytes + NUL
    char phone[21];       // room for 20 bytes + NUL
    double price;         // validate_price rejects negative values
    char enrolled_on[11]; // room for 10 characters + NUL, e.g. "2025-01-15"
    bool active;
} Student;

// Check 3: leading zeros.
// An ID is valid only when it is exactly six characters long and every
// character is an ASCII digit. On failure a diagnostic is printed to
// stdout and false is returned.
bool validate_id(const char *id) {
    const size_t expected_len = 6;

    if (strlen(id) != expected_len) {
        printf("ERROR: ID '%s' is not 6 chars\n", id);
        return false;
    }

    // Length is known to be 6 here, so walking to the terminator visits
    // exactly the six ID characters.
    for (const char *p = id; *p != '\0'; ++p) {
        const bool is_digit = (*p >= '0' && *p <= '9');
        if (!is_digit) {
            printf("ERROR: ID '%s' has non-digit\n", id);
            return false;
        }
    }
    return true;
}

// Check 2: NOT NULL.
// In this C version a missing value is represented by an empty string.
// `name` is the column label used in the diagnostic printed to stdout.
// Returns true when `field` has at least one character.
bool validate_not_null(const char *field, const char *name) {
    const bool is_empty = (field[0] == '\0');
    if (is_empty) {
        printf("ERROR: %s is empty (NOT NULL violated)\n", name);
    }
    return !is_empty;
}

// Price sanity check: a price must not be negative (zero is accepted).
// Prints a diagnostic to stdout and returns false for a negative price.
bool validate_price(double price) {
    const bool negative = price < 0;
    if (negative) {
        printf("ERROR: price %.2f is negative\n", price);
    }
    return !negative;
}

// Demo: build one sample record and print each validator's result
// (1 = valid, 0 = invalid).
int main() {
    Student sample = {
        "001234", "Somchai", "+6612345678",
        2500.00, "2025-01-15", true
    };

    // Each validator runs immediately before its result line so any
    // diagnostic it prints appears directly above that line.
    const bool id_ok = validate_id(sample.student_id);
    printf("ID valid: %d\n", id_ok);

    const bool name_ok = validate_not_null(sample.name, "name");
    printf("Name valid: %d\n", name_ok);

    const bool price_ok = validate_price(sample.price);
    printf("Price valid: %d\n", price_ok);

    return 0;
}

pandas ทำ checklist ได้ง่ายที่สุด:

import pandas as pd

# Read the CSV according to the contract. IDs and phone numbers are read as
# text so that leading zeros and "+" prefixes survive the import.
df = pd.read_csv('students.csv', dtype={
    'student_id': 'string',
    'name': 'string',
    'phone': 'string',
    'price': 'Float64',
    'enrolled_on': 'string',
    'active': 'boolean',
})
# Parse dates explicitly; with the default errors='raise' an unparseable
# value stops the import here instead of slipping through as text.
df['enrolled_on'] = pd.to_datetime(df['enrolled_on'])

# --- CHECK 1: Row count ---
expected_rows = 1000  # figure reported by the source
actual_rows = len(df)
print(f"Row count: {actual_rows} / {expected_rows}",
      "OK" if actual_rows == expected_rows else "MISMATCH!")

# --- CHECK 2: Nulls per column ---
null_counts = df.isnull().sum()
print("\nNulls per column:")
print(null_counts)
# Columns the contract declares NOT NULL must have zero nulls
# (phone is nullable, so it is only reported above).
not_null_cols = ['student_id', 'name', 'price', 'enrolled_on']
violated = [col for col in not_null_cols if null_counts[col] > 0]
print("NOT NULL columns:", "OK" if not violated else f"VIOLATED: {violated}")

# --- CHECK 3: Leading zeros ---
id_lengths = df['student_id'].str.len()
# A null ID has length <NA>; fillna(True) counts it as bad instead of
# leaving <NA> in the boolean mask, where the row would be dropped.
bad_ids = df[id_lengths.ne(6).fillna(True)]
print(f"\nIDs without 6 chars: {len(bad_ids)}")
if len(bad_ids) > 0:
    print(bad_ids['student_id'].head())

# --- CHECK 4: Dates parsed ---
print(f"\nenrolled_on dtype: {df['enrolled_on'].dtype}")
# Must be datetime64, not object/string

# --- CHECK 5: Totals ---
source_total = 2_500_000.00  # figure reported by the source
actual_total = df['price'].sum()
diff = abs(actual_total - source_total)
print(f"\nTotal: {actual_total:.2f} / {source_total:.2f}")
print(f"Diff: {diff:.2f}", "OK" if diff < 0.01 else "MISMATCH!")

SQL ตรวจได้หลัง COPY หรือ INSERT:

-- Assumes the data has already been loaded into the students table.

-- CHECK 1: Row count
SELECT COUNT(*) AS row_count
FROM students;
-- Compare with the source

-- CHECK 2: Nulls per column
-- COUNT(expr) skips NULLs, so counting a marker only where the column
-- IS NULL yields that column's null count (0 on an empty table).
SELECT
    COUNT(CASE WHEN student_id  IS NULL THEN 1 END) AS null_student_id,
    COUNT(CASE WHEN name        IS NULL THEN 1 END) AS null_name,
    COUNT(CASE WHEN phone       IS NULL THEN 1 END) AS null_phone,
    COUNT(CASE WHEN price       IS NULL THEN 1 END) AS null_price,
    COUNT(CASE WHEN enrolled_on IS NULL THEN 1 END) AS null_enrolled_on
FROM students;
-- null_student_id, null_name, null_price, null_enrolled_on must be 0

-- CHECK 3: Leading zeros (IDs that are not 6 characters long)
SELECT student_id, LENGTH(student_id)
FROM students
WHERE LENGTH(student_id) <> 6;
-- Must return 0 rows

-- CHECK 4: Date validation
-- A column declared DATE in CREATE TABLE is type-checked by the database:
-- inserting a string that is not a date fails with an error immediately.
-- NOTE(review): this holds for strictly typed engines; confirm for yours.

-- CHECK 5: Totals
SELECT SUM(price) AS total_price
FROM students;
-- Compare with the source total

Checklist สรุป

#	ตรวจอะไร	ถ้าพบปัญหา
1	Row count ตรง	แถวหาย → ตรวจ encoding, delimiter
2	Nulls per column	NOT NULL column มี null → data ไม่ครบ
3	Leading zeros ครบ	ID สั้นกว่าที่ควร → ต้อง import เป็น text ไม่ใช่ number
4	Dates ถูก parse	ยังเป็น string → ใช้ parse_dates หรือ TO_DATE
5	Totals ตรง	ยอดต่าง → ตรวจ float vs decimal rounding

ลองทำ

สมมติได้ CSV ที่มี 500 แถว, student_id 6 หลัก, price รวมเท่ากับ 1,250,000.00 — เขียน checklist script ใน Python หรือ SQL queries ที่ตรวจทั้ง 5 ข้อ
ลองจงใจทำให้ check ล้มเหลว — เช่น เปลี่ยน student_id ให้สั้นกว่า 6 หลัก — แล้วดูว่า script ตรวจเจอไหม

ดูตัวอย่าง script

def run_import_checklist(df, expected_rows, expected_total, id_col, id_length,
                         total_col, date_cols, not_null_cols=None):
    """Run the 5-point checklist on a DataFrame loaded from a CSV import.

    Args:
        df: The imported DataFrame.
        expected_rows: Row count reported by the source.
        expected_total: Sum of ``total_col`` reported by the source.
        id_col: Name of the ID column that must keep its leading zeros.
        id_length: Required number of characters in every ID.
        total_col: Name of the numeric column whose sum is compared.
        date_cols: Names of columns that must have a datetime dtype.
        not_null_cols: Columns that must not contain nulls. ``None``
            (the default) checks every column, as before.

    Returns:
        True when every check passes, False otherwise. The list of issues
        (or "ALL CHECKS PASSED") is printed to stdout.
    """
    # Local import keeps this snippet usable on its own.
    from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype

    issues = []

    # 1. Row count
    actual_rows = len(df)
    if actual_rows != expected_rows:
        issues.append(f"Row count: {actual_rows} != {expected_rows}")

    # 2. Nulls per column (only the columns the contract marks NOT NULL)
    required_cols = df.columns if not_null_cols is None else not_null_cols
    for col in required_cols:
        nulls = df[col].isnull().sum()
        if nulls > 0:
            issues.append(f"Column '{col}' has {nulls} nulls")

    # 3. Leading zeros
    ids = df[id_col]
    if is_numeric_dtype(ids):
        # The .str accessor raises on numeric columns, and a numeric ID
        # column is exactly the failure this check exists to catch.
        issues.append(
            f"'{id_col}' was parsed as {ids.dtype}, not text; "
            "leading zeros are lost"
        )
    else:
        lengths = ids.astype("string").str.len()
        # Null IDs have length <NA>; count them as bad rather than letting
        # <NA> fall out of the boolean mask.
        bad_count = int(lengths.ne(id_length).fillna(True).sum())
        if bad_count > 0:
            issues.append(f"{bad_count} IDs not {id_length} chars")

    # 4. Dates: anything that is not a datetime dtype (object, pandas
    # 'string', numbers, ...) means the column was not parsed.
    for col in date_cols:
        if not is_datetime64_any_dtype(df[col]):
            issues.append(f"'{col}' is not datetime (dtype: {df[col].dtype})")

    # 5. Totals (tolerance absorbs float rounding)
    diff = abs(df[total_col].sum() - expected_total)
    if diff > 0.01:
        issues.append(f"Total diff: {diff:.2f}")

    if issues:
        print("ISSUES FOUND:")
        for issue in issues:
            print(f"  - {issue}")
    else:
        print("ALL CHECKS PASSED")

    return not issues