#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import json
import xml.etree.ElementTree as ET
import re
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import html # Added for html.unescape

# Global lock shared by the worker threads: it serializes appends to the
# output and checkpoint files and updates to the shared progress counter.
lock = threading.Lock()

# OUTPUT_FILE receives one JSON object per line (NDJSON); CHECKPOINT_FILE
# receives one processed URL per line. Both are opened in append mode.
OUTPUT_FILE = "jobs.ndjson"
CHECKPOINT_FILE = "checkpoint.txt"

def extract_urls_from_sitemap(xml_content):
    """
    Parses XML sitemap content and extracts the URLs found in
    <url>/<loc> elements of the sitemaps.org 0.9 namespace.

    Leading whitespace and a leading byte-order mark are removed before
    parsing, because the XML parser rejects an XML declaration that is not
    at the very start of the document.

    Parameters:
        xml_content: the sitemap document as str (or bytes).

    Returns:
        A list of URL strings in document order. On a parse failure the
        error is printed and an empty list is returned.
    """
    # Trim what may legally precede the document in a fetched response body
    # but would make the parser fail on the XML declaration.
    if isinstance(xml_content, str):
        xml_content = xml_content.lstrip("\ufeff \t\r\n")
    elif isinstance(xml_content, (bytes, bytearray)):
        xml_content = xml_content.lstrip()

    urls = []
    try:
        root = ET.fromstring(xml_content)
    except (ET.ParseError, TypeError, ValueError) as e:
        print(f"Error parsing XML sitemap: {e}")
        return urls

    namespace = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    for url_elem in root.findall("sm:url", namespace):
        loc_elem = url_elem.find("sm:loc", namespace)
        # Skip <url> entries that lack a <loc> or whose <loc> is empty.
        if loc_elem is not None and loc_elem.text:
            urls.append(loc_elem.text.strip())
    return urls

def attempt_fix_and_parse_json(json_text):
    """
    Best-effort repair of JSON-LD text whose "description" value contains
    raw HTML that makes the document invalid JSON.

    The description value is located heuristically, reduced to plain text,
    re-escaped as a JSON string body and spliced back in; the result is
    then parsed.

    Returns the parsed data on success, or None when the description
    cannot be located or the repaired text still fails to parse.
    """
    try:
        # Find the key (case-insensitively) that introduces the description.
        key_hit = re.search(r'"description"\s*:', json_text, re.IGNORECASE)
        if key_hit is None:
            print("Fix attempt: 'description' key not found.")
            return None

        colon_at = json_text.find(':', key_hit.start(0))
        if colon_at == -1:
            print("Fix attempt: Colon after 'description' key not found.")
            return None

        opening_quote_at = json_text.find('"', colon_at)
        if opening_quote_at == -1:
            print("Fix attempt: Opening quote for 'description' value not found.")
            return None

        body_start = opening_quote_at + 1
        tail = json_text[body_start:]

        # Heuristic end of the value: the first quote followed by a comma or
        # a closing brace. If none exists, fall back to the first quote.
        closing = re.search(r'"\s*(,|}|\s*,)', tail)
        if closing is None:
            closing = re.search(r'"', tail)
        if closing is None:
            print("Fix attempt: Could not find a plausible end for 'description' value.")
            return None

        body_end = body_start + closing.start()

        # Strip markup, then unescape any entities that remain.
        markup = BeautifulSoup(json_text[body_start:body_end], 'html.parser')
        plain_text = html.unescape(markup.get_text(separator=' ', strip=True))

        # json.dumps yields a quoted, escaped string; only its inner part is
        # needed because the original surrounding quotes are kept.
        escaped_body = json.dumps(plain_text)[1:-1]

        repaired_text = "".join(
            (json_text[:body_start], escaped_body, json_text[body_end:])
        )

        data = json.loads(repaired_text)
        print("Successfully parsed JSON after attempting to fix description.")
        return data
    except Exception as e_fix:
        print(f"Failed to parse JSON even after attempting to fix description: {e_fix}")
        return None


def _is_job_posting(node):
    """
    Returns True when node is a dict whose "@type" is "JobPosting".

    JSON-LD allows "@type" to be either a single string or a list of
    strings, so both shapes are accepted.
    """
    if not isinstance(node, dict):
        return False
    declared_type = node.get("@type")
    if isinstance(declared_type, list):
        return "JobPosting" in declared_type
    return declared_type == "JobPosting"


def extract_job_postings(html_content):
    """
    Parses the HTML with BeautifulSoup and collects every JSON-LD object
    typed as JobPosting.

    Each <script type="application/ld+json"> block is parsed as JSON; when
    that fails, attempt_fix_and_parse_json is tried on the raw text. A
    block may hold a single object, a list of objects, or an object with
    an "@graph" list; JobPosting objects are taken from all three shapes.

    Parameters:
        html_content: the page HTML.

    Returns:
        A list of JobPosting dicts (possibly empty). Blocks that cannot be
        parsed or processed are reported via print and skipped.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    job_postings_data = []

    scripts = soup.find_all("script", type="application/ld+json")
    if not scripts:
        print("No ld+json scripts found on this page.")

    for script_tag in scripts:
        try:
            json_text = script_tag.string if script_tag.string else script_tag.get_text()
            if not json_text or not json_text.strip():
                print("Empty script content; skipping script.")
                continue

            try:
                parsed_data = json.loads(json_text)
            except json.JSONDecodeError as e_json:
                print(f"Initial JSON parsing failed: {e_json} at line {e_json.lineno} col {e_json.colno} (pos {e_json.pos}). Attempting to fix.")
                parsed_data = attempt_fix_and_parse_json(json_text)

            if not parsed_data:
                continue  # Parsing (and the repair attempt) failed or was empty.

            # Normalize to a list so single objects and arrays share one path.
            items_to_check = parsed_data if isinstance(parsed_data, list) else [parsed_data]

            for item in items_to_check:
                if not isinstance(item, dict):
                    continue
                if _is_job_posting(item):
                    job_postings_data.append(item)
                elif isinstance(item.get("@graph"), list):
                    job_postings_data.extend(
                        graph_item for graph_item in item["@graph"]
                        if _is_job_posting(graph_item)
                    )

        except Exception as e_outer:
            # One malformed script block must not abort the rest of the page.
            print(f"Error processing a script tag: {e_outer}")
            continue

    return job_postings_data

def _contains_digits(candidate):
    """Returns True if the string form of candidate contains a digit."""
    return candidate is not None and bool(re.search(r'\d', str(candidate)))


def extract_salary(base_salary):
    """
    Extracts a salary string from a JobPosting "baseSalary" field.

    Accepted shapes:
      - a dict whose "value" is a dict with "minValue"/"maxValue" and/or
        "value", plus an optional "unitText";
      - a dict whose "value" is a plain number or string;
      - a plain number or string.

    Returns:
        A formatted string such as "$10-$20 hour" or "$50000 year". A
        non-numeric text value is returned as-is unless it is the
        placeholder "Not Disclosed by Recruiter". Returns an empty string
        when nothing usable is found.
    """
    if not isinstance(base_salary, dict):
        # baseSalary given directly as a number or string.
        if base_salary and _contains_digits(base_salary):
            return str(base_salary).strip()
        return ""

    value = base_salary.get("value")
    if not isinstance(value, dict):
        # "value" given directly as a number or string.
        return str(value).strip() if _contains_digits(value) else ""

    min_val = value.get("minValue")
    max_val = value.get("maxValue")
    single_val = value.get("value")  # May be numeric or text like "Not Disclosed".
    # unitText may be absent or explicitly null.
    unit = str(value.get("unitText") or "").lower()

    salary = ""
    if min_val is not None and max_val is not None:
        if _contains_digits(min_val) and _contains_digits(max_val):
            salary = f"${min_val}-${max_val} {unit}"
    elif single_val is not None:
        if _contains_digits(single_val):
            salary = f"${single_val} {unit}"
        elif isinstance(single_val, str) and single_val != "Not Disclosed by Recruiter":
            salary = single_val
    else:
        # Only one bound of a range is present: report that bound rather
        # than formatting the whole dict into the output.
        lone_bound = min_val if min_val is not None else max_val
        if _contains_digits(lone_bound):
            salary = f"${lone_bound} {unit}"

    return salary.strip()

def _address_part_text(part):
    """
    Returns the display text for one address component.

    A component may be a plain string or an object such as
    {"@type": "Country", "name": "US"}; for objects the "name" is used.
    Missing or empty components yield an empty string.
    """
    if isinstance(part, dict):
        part = part.get("name")
    return str(part) if part else ""


def extract_fields(job_posting):
    """
    Extracts the desired fields from one JobPosting dict.

    Returns:
        A dict with:
          - "job_title": "title" (or "jobTitle") stripped, else "";
          - "body_parts": a one-element list with the description, or [];
          - "location": locality, region and country joined with ", ",
            or the address itself when it is a plain string;
          - "job_details": the formatted salary (if any) followed by the
            title-cased employment type(s) (if any).
    """
    job_title = job_posting.get("title") or job_posting.get("jobTitle", "")
    description = job_posting.get("description", "")
    body_parts = [description] if description else []

    location = ""
    job_location = job_posting.get("jobLocation")
    if job_location:
        # Only the first location is used when several are listed.
        loc_item = job_location[0] if isinstance(job_location, list) else job_location
        if isinstance(loc_item, dict):
            address = loc_item.get("address", {})
            if isinstance(address, dict):
                # Components are normalized to text first so that object
                # values (e.g. a Country object) cannot break the join.
                part_texts = (
                    _address_part_text(address.get(key))
                    for key in ("addressLocality", "addressRegion", "addressCountry")
                )
                location = ", ".join(filter(None, part_texts))
            elif isinstance(address, str):
                location = address

    job_details = []
    base_salary = job_posting.get("baseSalary")
    if base_salary:
        salary = extract_salary(base_salary)
        if salary:
            job_details.append(salary)

    employment_type = job_posting.get("employmentType")
    if employment_type:
        if isinstance(employment_type, list):
            emp_types = ", ".join([str(t).lower().title() for t in employment_type])
        else:
            emp_types = str(employment_type).lower().title()
        job_details.append(emp_types)

    return {
        # str() guards against a non-string title in malformed markup.
        "job_title": str(job_title).strip() if job_title else "",
        "body_parts": body_parts,
        "location": location.strip(),
        "job_details": job_details
    }

def process_job_page(url, headers):
    """
    Downloads one job page and converts every JobPosting schema found on
    it into a dict of extracted fields.

    Returns a list of those dicts. An empty list is returned when the
    response status is not 200, when no JobPosting schema is found, or
    when any error occurs (errors are printed, not raised).
    """
    try:
        print(f"Fetching job page: {url}")
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code != 200:
            print(f"Failed to fetch {url}: HTTP {response.status_code}")
            return []

        # Decode using the encoding guessed from the body.
        response.encoding = response.apparent_encoding
        job_posting_schemas = extract_job_postings(response.text)

        if not job_posting_schemas:
            print(f"No JobPosting schema successfully parsed in {url}.")
        return [extract_fields(jp_schema) for jp_schema in job_posting_schemas]
    except requests.exceptions.RequestException as e_req:
        print(f"Request error processing job page {url}: {e_req}")
        return []
    except Exception as e:
        print(f"Generic error processing job page {url}: {e}")
        return []

def save_checkpoint(url):
    """
    Records url as processed by appending it, on its own line, to the
    checkpoint file. The global lock is held for the duration of the write.
    """
    with lock, open(CHECKPOINT_FILE, "a", encoding="utf-8") as checkpoint:
        entry = url + "\n"
        checkpoint.write(entry)

def save_job_data(job_data):
    """
    Serializes job_data to JSON and appends it as one line to the NDJSON
    output file. The global lock is held for the duration of the write.
    """
    with lock, open(OUTPUT_FILE, "a", encoding="utf-8") as ndjson:
        record = json.dumps(job_data) + "\n"
        ndjson.write(record)

def load_checkpoint():
    """
    Loads the set of already-processed URLs from the checkpoint file.

    Returns an empty set when the file does not exist. Blank lines are
    ignored and surrounding whitespace is stripped from each entry.
    """
    if not os.path.exists(CHECKPOINT_FILE):
        return set()

    with open(CHECKPOINT_FILE, "r", encoding="utf-8") as f:
        entries = (line.strip() for line in f)
        return {entry for entry in entries if entry}

def process_and_checkpoint(url, headers, counters, total):
    """
    Worker task for a single job URL.

    Steps:
      1. Fetch the page and extract its job postings.
      2. Append each posting to the output file.
      3. Append the URL to the checkpoint file; this happens even when
         the page produced no postings.
      4. Increment counters["processed"] and print progress against total,
         both under the global lock.
    """
    for job_data in process_job_page(url, headers):
        save_job_data(job_data)

    save_checkpoint(url)

    with lock:
        counters["processed"] += 1
        print(f"Progress: {counters['processed']} of {total} URLs processed ({url}).")

def main():
    """
    Entry point of the scraper.

    Reads sitemap URLs from "sitemaps.txt", downloads each one and gathers
    job URLs from it. De-duplicates and sorts the URLs, drops those listed
    in the checkpoint file, and processes the remainder concurrently with
    a thread pool. Progress and errors are reported via print.
    """
    sitemap_list_file = "sitemaps.txt"
    if not os.path.exists(sitemap_list_file):
        print(f"Error: Sitemap list file '{sitemap_list_file}' not found.")
        return

    try:
        with open(sitemap_list_file, "r", encoding="utf-8") as f:
            sitemap_urls = [entry for entry in (line.strip() for line in f) if entry]
    except Exception as e:
        print(f"Error reading {sitemap_list_file}: {e}")
        return

    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
        )
    }

    all_job_urls = []
    xml_root_markers = ("<urlset", "<sitemapindex")

    # Gather job URLs from every listed sitemap.
    for sitemap_url in sitemap_urls:
        print(f"\nProcessing sitemap: {sitemap_url}")
        try:
            response = requests.get(sitemap_url, headers=headers, timeout=30)
            if response.status_code != 200:
                print(f"Failed to fetch sitemap {sitemap_url}: HTTP {response.status_code}")
                continue

            # Decode using the encoding guessed from the body.
            response.encoding = response.apparent_encoding
            sitemap_content = response.text

            lowered_content = sitemap_content.lower()
            looks_like_xml = (
                sitemap_content.lstrip().startswith("<?xml")
                or any(marker in lowered_content for marker in xml_root_markers)
            )

            if not looks_like_xml:
                # Non-XML body: use any lines that start with "http" as job
                # URLs; if there are none, treat the listed URL itself as a
                # job page.
                print(f"Sitemap {sitemap_url} does not appear to be XML. Treating as a direct URL or list.")
                potential_urls = [
                    candidate
                    for candidate in map(str.strip, sitemap_content.splitlines())
                    if candidate.startswith("http")
                ]
                if potential_urls:
                    all_job_urls.extend(potential_urls)
                elif sitemap_url.startswith("http"):
                    all_job_urls.append(sitemap_url)
                continue

            job_urls_from_sitemap = extract_urls_from_sitemap(sitemap_content)
            if job_urls_from_sitemap:
                print(f"Found {len(job_urls_from_sitemap)} URLs in sitemap {sitemap_url}")
                all_job_urls.extend(job_urls_from_sitemap)
            else:
                print(f"No URLs extracted from sitemap {sitemap_url}.")
        except requests.exceptions.RequestException as e_req:
            print(f"Request error processing sitemap {sitemap_url}: {e_req}")
        except Exception as e:
            print(f"Generic error processing sitemap {sitemap_url}: {e}")

    # De-duplicate (sorted for a consistent order) and skip finished URLs.
    all_job_urls = sorted(set(all_job_urls))
    processed_urls = load_checkpoint()
    remaining_urls = [url for url in all_job_urls if url not in processed_urls]
    total_all = len(all_job_urls)
    total_remaining = len(remaining_urls)
    print(f"\nTotal unique job URLs found: {total_all}")
    print(f"Total job URLs to process: {total_remaining} (skipping {total_all - total_remaining} already processed URLs)")

    # Progress counter shared by all workers.
    counters = {"processed": 0}

    max_workers = 10  # Adjust based on your system/network capabilities.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_and_checkpoint, url, headers, counters, total_remaining)
            for url in remaining_urls
        ]

        # Surface any exception that escaped a worker.
        for future_item in as_completed(futures):
            try:
                future_item.result()
            except Exception as e_thread:
                print(f"Error in thread execution: {e_thread}")

    print(f"\nFinished. Extracted job postings have been appended to {OUTPUT_FILE}")

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()