import os
import math
import datetime
import xml.etree.ElementTree as ET
from xml.dom import minidom # For pretty printing XML

# --- Configuration ---
# Directory scanned (top level only) for files whose name ends in ".html"
# (case-insensitive).
SOURCE_HTML_DIR = r"./job"
# Directory the sitemap files and the sitemap index are written to; it is
# created if it does not exist.
OUTPUT_DIR = r"./"
# Prefix prepended to each HTML file name to build its <loc> URL. The
# scheme and host of this URL are also used for the sitemap URLs listed in
# the index.
BASE_URL = "https://jobseekusa.nichesite.org/job/"
# Maximum number of <url> entries written to one sitemap file before a new
# file is started.
MAX_URLS_PER_SITEMAP = 3000
# Sitemap files are named "<prefix><n>.xml", with n starting at 1.
SITEMAP_FILENAME_PREFIX = "sitemap"
# File name of the sitemap index that lists every generated sitemap.
SITEMAP_INDEX_FILENAME = "sitemap_index.xml"

# When True, every <url> entry also gets <changefreq> and <priority>
# children filled with the two defaults below.
INCLUDE_CHANGEFREQ_PRIORITY = True
DEFAULT_CHANGEFREQ = "weekly"
DEFAULT_PRIORITY = "0.8"

# --- <lastmod> source ---
# True: every <lastmod> (in the sitemaps and in the index) is the time the
# script was run. False: the modification time of the file on disk is used.
FORCE_CURRENT_DATE_FOR_LASTMOD = True

# --- Today-only filter ---
# True: files whose modification date (in UTC) is not the current UTC date
# are skipped. False: every HTML file found is included.
SITEMAP_ONLY_TODAY_URLS = False # set to True to enable the today-only filter

# --- End Configuration ---

# --- Helper Functions ---
def find_html_files_generator(directory):
    """
    Lazily yield the paths of the HTML files directly inside ``directory``.

    Only regular files whose name ends in ".html" (case-insensitive) are
    yielded; sub-directories are not descended into. Entries are produced one
    at a time via os.scandir so the full listing is never held in memory.

    Args:
        directory: Path of the directory to scan.

    Yields:
        The path of each matching file, in the order os.scandir reports them.

    Errors while scanning (including a missing directory) are printed and end
    the sequence early instead of being raised.
    """
    print(f"Scanning directory: {directory}")
    try:
        # The context manager closes the directory handle even when the
        # consumer stops iterating before the listing is exhausted.
        with os.scandir(directory) as entries:
            for entry in entries:
                # DirEntry.is_file() can usually answer from data cached by
                # the directory scan, avoiding an extra stat call per entry.
                if entry.is_file() and entry.name.lower().endswith(".html"):
                    yield entry.path
    except FileNotFoundError:
        # Nothing is yielded, so the caller simply sees an empty sequence.
        print(f"Error: Source directory not found: {directory}")
    except Exception as e:
        print(f"Error scanning directory {directory}: {e}")

def format_datetime_w3c(dt_object):
    """Format a datetime as a W3C Datetime string in UTC.

    Args:
        dt_object: The datetime to format. A timezone-aware value is
            converted to UTC first; a naive value is assumed to already be
            expressed in UTC and is formatted as-is.

    Returns:
        A string of the form ``YYYY-MM-DDThh:mm:ssZ``.
    """
    # The trailing "Z" asserts UTC, so an aware datetime in another zone must
    # be shifted to UTC or the emitted timestamp would be wrong.
    tzinfo = getattr(dt_object, "tzinfo", None)
    if tzinfo is not None and dt_object.utcoffset() is not None:
        dt_object = dt_object.astimezone(datetime.timezone.utc)
    return dt_object.strftime('%Y-%m-%dT%H:%M:%SZ')

def prettify_xml(elem):
    """Serialize ``elem`` to indented, UTF-8 encoded XML.

    The element is dumped with ElementTree and re-parsed with minidom, whose
    pretty-printer adds the line breaks and two-space indentation.

    Returns:
        The document as bytes, including the XML declaration.
    """
    dom = minidom.parseString(ET.tostring(elem, encoding='utf-8'))
    return dom.toprettyxml(indent="  ", encoding='utf-8')

# --- Main Logic (Optimized for Memory) ---
def generate_sitemaps():
    """Write batched sitemap files and a sitemap index for the HTML files.

    The ``.html`` files found in ``SOURCE_HTML_DIR`` are streamed and written
    in batches of at most ``MAX_URLS_PER_SITEMAP`` URLs to
    ``<SITEMAP_FILENAME_PREFIX><n>.xml`` inside ``OUTPUT_DIR``. Afterwards
    ``SITEMAP_INDEX_FILENAME`` is written, listing every sitemap that was
    written successfully.

    Behaviour is controlled by the module-level configuration constants:
    ``FORCE_CURRENT_DATE_FOR_LASTMOD`` selects between the generation time and
    each file's modification time for ``<lastmod>``, and
    ``SITEMAP_ONLY_TODAY_URLS`` restricts the output to files modified on the
    current UTC date.

    Progress and problems are reported with ``print``. Failures for a single
    file or a single sitemap are reported and skipped rather than raised. The
    function returns ``None``.
    """
    # Function-scope import: the module's top-level imports do not include
    # urllib, and this is the only place it is needed.
    from urllib.parse import quote, urlsplit

    if not os.path.exists(OUTPUT_DIR):
        print(f"Output directory '{OUTPUT_DIR}' not found. Creating it...")
        try:
            # exist_ok covers the directory appearing between the check above
            # and this call.
            os.makedirs(OUTPUT_DIR, exist_ok=True)
            print(f"Created output directory: {OUTPUT_DIR}")
        except OSError as e:
            print(f"Error creating output directory '{OUTPUT_DIR}': {e}")
            return

    utc = datetime.timezone.utc
    sitemap_namespace = "http://www.sitemaps.org/schemas/sitemap/0.9"

    # Get the current UTC datetime and date ONCE for consistency
    current_utc_datetime = datetime.datetime.now(utc)
    current_w3c_time_str = format_datetime_w3c(current_utc_datetime)
    current_utc_date = current_utc_datetime.date()

    if FORCE_CURRENT_DATE_FOR_LASTMOD:
        print(f"WARNING: Forcing all <lastmod> dates to current generation time: {current_w3c_time_str}")

    if SITEMAP_ONLY_TODAY_URLS:
        print(f"WARNING: SITEMAP_ONLY_TODAY_URLS is enabled. Only including files modified on {current_utc_date} (UTC).")

    # Tolerate a BASE_URL configured without its trailing slash.
    url_prefix = BASE_URL if BASE_URL.endswith("/") else BASE_URL + "/"
    # The file's own datetime is only needed for the today-only filter or
    # when <lastmod> is taken from the file.
    need_file_datetime = SITEMAP_ONLY_TODAY_URLS or not FORCE_CURRENT_DATE_FOR_LASTMOD

    html_file_generator = find_html_files_generator(SOURCE_HTML_DIR)
    generated_sitemap_files = []
    total_files_processed = 0
    total_files_skipped = 0
    sitemap_num = 1

    while True:
        urlset = ET.Element("urlset", attrib={"xmlns": sitemap_namespace})
        file_count_in_batch = 0

        # Iterating the same generator again resumes where the previous
        # batch stopped, so each pass fills exactly one sitemap.
        for html_filepath in html_file_generator:
            try:
                last_mod_timestamp = os.path.getmtime(html_filepath)
                file_mod_datetime = None
                if need_file_datetime:
                    # Aware UTC datetime; replaces the deprecated
                    # datetime.utcfromtimestamp().
                    file_mod_datetime = datetime.datetime.fromtimestamp(
                        last_mod_timestamp, tz=utc
                    )
            except (OSError, OverflowError, ValueError) as e:
                print(f"  Warning: Could not process file '{html_filepath}' for sitemap entry: {e}")
                continue

            if SITEMAP_ONLY_TODAY_URLS and file_mod_datetime.date() != current_utc_date:
                total_files_skipped += 1
                continue  # Not modified today (UTC)

            if file_count_in_batch == 0:  # First file for this new sitemap
                print(f"\nGenerating {SITEMAP_FILENAME_PREFIX}{sitemap_num}.xml...")

            # The sitemap protocol requires URL-escaped locations. Quoting the
            # filesystem-encoded bytes also copes with names that are not
            # valid Unicode.
            filename_with_ext = os.path.basename(html_filepath)
            loc_url = url_prefix + quote(os.fsencode(filename_with_ext))

            if FORCE_CURRENT_DATE_FOR_LASTMOD:
                last_mod_w3c = current_w3c_time_str
            else:
                last_mod_w3c = format_datetime_w3c(file_mod_datetime)

            url_element = ET.SubElement(urlset, "url")
            ET.SubElement(url_element, "loc").text = loc_url
            ET.SubElement(url_element, "lastmod").text = last_mod_w3c

            if INCLUDE_CHANGEFREQ_PRIORITY:
                ET.SubElement(url_element, "changefreq").text = DEFAULT_CHANGEFREQ
                ET.SubElement(url_element, "priority").text = DEFAULT_PRIORITY

            file_count_in_batch += 1
            total_files_processed += 1

            # If the batch is full, break to write the file
            if file_count_in_batch >= MAX_URLS_PER_SITEMAP:
                break

        if file_count_in_batch == 0:
            # Nothing was collected, meaning the generator is exhausted
            break

        sitemap_filename = f"{SITEMAP_FILENAME_PREFIX}{sitemap_num}.xml"
        sitemap_filepath = os.path.join(OUTPUT_DIR, sitemap_filename)
        try:
            xml_bytes = prettify_xml(urlset)
            with open(sitemap_filepath, "wb") as f:
                f.write(xml_bytes)
        except Exception as e:
            # Deliberately best-effort: report the failure and carry on with
            # the next batch. sitemap_num is not advanced, so the next batch
            # reuses this file name.
            print(f"Error writing sitemap file '{sitemap_filepath}': {e}")
        else:
            print(f"Successfully created '{sitemap_filepath}' with {file_count_in_batch} URLs.")
            generated_sitemap_files.append(sitemap_filename)
            sitemap_num += 1

    print(f"\nTotal HTML files processed: {total_files_processed}")
    if SITEMAP_ONLY_TODAY_URLS:
        print(f"Total HTML files skipped (not modified today): {total_files_skipped}")

    if not generated_sitemap_files:
        print("\nNo individual sitemaps were generated. Cannot create sitemap index.")
        return

    # --- Generate Sitemap Index ---
    index_filepath = os.path.join(OUTPUT_DIR, SITEMAP_INDEX_FILENAME)
    print(f"\nGenerating sitemap index file: {index_filepath}...")

    sitemapindex = ET.Element("sitemapindex", xmlns=sitemap_namespace)
    # Sitemaps are advertised at the site root: scheme + host of BASE_URL.
    base_url_parts = urlsplit(BASE_URL)
    domain_base = f"{base_url_parts.scheme}://{base_url_parts.netloc}"

    for sitemap_file_name in generated_sitemap_files:
        if FORCE_CURRENT_DATE_FOR_LASTMOD:
            sitemap_last_mod_w3c = current_w3c_time_str
        else:
            try:
                sitemap_last_mod_timestamp = os.path.getmtime(
                    os.path.join(OUTPUT_DIR, sitemap_file_name)
                )
                sitemap_last_mod_w3c = format_datetime_w3c(
                    datetime.datetime.fromtimestamp(sitemap_last_mod_timestamp, tz=utc)
                )
            except (OSError, OverflowError, ValueError) as e:
                print(f"  Warning: Could not process sitemap index entry for '{sitemap_file_name}': {e}")
                continue

        sitemap_element = ET.SubElement(sitemapindex, "sitemap")
        ET.SubElement(sitemap_element, "loc").text = f"{domain_base}/{sitemap_file_name}"
        ET.SubElement(sitemap_element, "lastmod").text = sitemap_last_mod_w3c

    try:
        xml_bytes_index = prettify_xml(sitemapindex)
        with open(index_filepath, "wb") as f:
            f.write(xml_bytes_index)
        print(f"Successfully created sitemap index '{index_filepath}'.")
    except Exception as e:
        print(f"Error writing sitemap index file '{index_filepath}': {e}")

    print("\nSitemap generation process finished.")
    print("IMPORTANT: Upload ALL generated .xml files to the root directory of your website.")
    print(f"Then submit ONLY the sitemap index URL to search engines: {domain_base}/{SITEMAP_INDEX_FILENAME}")

if __name__ == "__main__":
    # Validate the configuration before doing any work, and explain what to
    # fix instead of failing part-way through generation.
    if not os.path.isdir(SOURCE_HTML_DIR):
        print("="*50)
        print("!!! PLEASE EDIT THE SCRIPT CONFIGURATION !!!")
        print(f"The SOURCE_HTML_DIR ('{SOURCE_HTML_DIR}') does not seem to be a valid directory.")
        print("Please update it to the correct path where your HTML files are located.")
        print("Also check 'OUTPUT_DIR' and 'BASE_URL'.")
        print("="*50)
    # Require a full "http://" or "https://" prefix (scheme matched
    # case-insensitively). A bare startswith("http") would also accept values
    # such as "httpfoo", which have no host part to build sitemap URLs from.
    elif not BASE_URL.lower().startswith(("http://", "https://")):
        print("="*50)
        print("!!! PLEASE EDIT THE SCRIPT CONFIGURATION !!!")
        print(f"The BASE_URL ('{BASE_URL}') does not look like a valid absolute URL.")
        print("It should start with http:// or https://")
        print("="*50)
    else:
        generate_sitemaps()