<?php

//============================================================================
//===============================NAME ANALYSER FUNCTION=======================
//============================================================================
require_once 'eth_check.php';


/**
 * getInputOverride - build the <ConfigOverrides> fragment that forces the analyzer group by script.
 *
 * The first matching script wins: Arabic -> 'arab', Cyrillic -> 'slavic', Han -> 'chinese'.
 * When no script matches, the fragment is still returned with an empty value attribute.
 *
 * @param string $name name whose script is inspected
 * @return string XML fragment to embed in the request
 */
function getInputOverride($name){
    // ordered: earlier entries take precedence when a name mixes scripts
    $scriptGroups = [
        '/\p{Arabic}/u'     => 'arab',
        '/[\p{Cyrillic}]/u' => 'slavic',
        '/\p{Han}+/u'       => 'chinese',
    ];

    $group = "";
    foreach ($scriptGroups as $pattern => $candidate) {
        if (preg_match($pattern, $name)) {
            $group = $candidate;
            break;
        }
    }

    if ($group !== "") {
        write_to_log("INFO", "forced group {$group} for {$name}");
    }

    return sprintf(
        '<ConfigOverrides><item path="%s" value="%s"/></ConfigOverrides>',
        "/Configuration/Execution/engines/engine[@name='IVNlp']/modules/module[@name='mainChartParser']/nameAnalyzer/groups/@forcedGroup",
        $group
    );
}
/**
 * fetchPersonData - connect directly to the NA (without EM) for analyseName info.
 *
 * Sends an <AnalyzeName> envelope to TCP port 6001 on $ip, reads the reply until the
 * closing </IVEnvelope> tag shows up, and flattens the returned name alternatives.
 *
 * @param string $ip             host / IP address of the NA service
 * @param string $name           name to analyse
 * @param bool   $reverseOrder   when false, alternatives whose naming_convention contains
 *                               "Reverse Order" are dropped
 * @param bool   $fromBackground when true, the script-based forced-group override
 *                               (getInputOverride) is not added to the request
 * @return array|false|null array with 'caption', 'ethnicity' and 'Name Alternatives' (list);
 *                          false on any socket failure; null when the reply is not parseable XML
 */
function fetchPersonData($ip, $name, $reverseOrder, $fromBackground = false){
    // NOTE(review): these two calls change error reporting for the whole request, not only
    // for this function - kept as-is, confirm that this is still wanted.
    error_reporting(E_ALL);
    ini_set('display_errors', '1');

    // Escape the name so that &, < and > cannot break the request envelope.
    // double_encode=false leaves entities that a caller already escaped untouched.
    $xmlName = htmlspecialchars((string)$name, ENT_XML1 | ENT_NOQUOTES | ENT_SUBSTITUTE, 'UTF-8', false);

    $out  = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
    $out .= "<IVEnvelope><Request id=\"3\"><AnalyzeName>";
    //hack for input - only if from NA
    if(!$fromBackground){
        $out .= getInputOverride($name);
    }
    $out .= "<ResultType value=\"xml\"/><Name>$xmlName</Name></AnalyzeName></Request></IVEnvelope>";

    $socket = socket_create(AF_INET, SOCK_STREAM, SOL_TCP);
    if ($socket === false) {
        write_to_log("ERROR", "na socket create failed: ".socket_strerror(socket_last_error()));
        return false;
    }

    $output = "";
    try {
        if (socket_connect($socket, $ip, 6001) === false) {
            write_to_log("ERROR", "na socket connect failed: ".socket_strerror(socket_last_error($socket)));
            return false;
        }

        if (socket_write($socket, $out, strlen($out)) === false) {
            write_to_log("ERROR", "na socket write failed: ".socket_strerror(socket_last_error($socket)));
            return false;
        }

        socket_set_option($socket, SOL_SOCKET, SO_RCVTIMEO, array("sec"=>30, "usec"=>0));

        $numRetries = 0;
        while ($numRetries++ < 500) {
            set_time_limit(40); //extend the time limit
            $read = socket_read($socket, 1024);
            if ($read === false) {
                $errorcode = socket_last_error($socket);
                // these two codes are treated as "try again" (10060 is the Windows timeout code)
                if ($errorcode === SOCKET_EMSGSIZE || $errorcode === 10060) {
                    continue;
                }
                write_to_log("ERROR", "na socket read failed: ".socket_strerror($errorcode));
                return false;
            }
            if ($read === '') {
                // peer closed the connection: nothing more will arrive, stop polling
                break;
            }
            $output .= $read;

            if (strpos($output, "</IVEnvelope>") !== false) {
                break;
            }
        }
    } finally {
        // runs on every exit path above, so the socket is never leaked
        socket_close($socket);
    }

    $simpleXml = simplexml_load_string($output);
    if ($simpleXml === false) {
        // libxml_get_last_error() returns an object (or false); it cannot be concatenated directly
        $error = libxml_get_last_error();
        $detail = ($error instanceof LibXMLError) ? trim($error->message) : "unknown error";
        write_to_log("ERROR", "na xml error for $name : ".$detail);
        libxml_clear_errors();
        return null;
    }

    $res = [];
    $analysis = $simpleXml->Reply->AnalyzeName->Analysis;
    $res['caption'] = (string)$analysis->caption;
    $res['ethnicity'] = (string)$analysis->ethnicity;
    $res['Name Alternatives'] = [];

    $nameAlternatives = $analysis->name_alternatives;
    if(!empty($nameAlternatives) && $nameAlternatives->count() > 0){
        foreach ($nameAlternatives as $alternativesNode) {
            foreach ($alternativesNode->name_alternative as $alternativeNode) {
                $alternative = [];
                $skip = false;

                foreach ($alternativeNode as $element) {
                    $tag = str_replace("-", "_", $element->getName());
                    $text = trim((string)$element);

                    if($tag == 'unknown_name_role'){
                        $alternative['warning'] = "No Analysis";
                        $alternative['caption'] = $name;
                    }

                    // reverse-order alternatives are only wanted when explicitly requested
                    if($tag == "naming_convention" && strpos($text, "Reverse Order") !== false && !$reverseOrder){
                        $skip = true;
                        break;
                    }

                    $attributes = $element->attributes();
                    if(!empty($attributes["instance"]) && !in_array($tag, ["gender", "ethnicity", "naming_convention"])){
                        if(empty($element)){
                            write_to_log("ERROR", "empty caption in xml response - $tag");
                            continue;
                        }
                        $inst_arr = getInstancesFromAttr((string)$attributes["instance"]);
                        //if it empty instances array from attributes (if its dynamic instances) - just place the caption
                        $alternative[$tag] = empty($inst_arr)
                            ? $text
                            : ["caption" => $text, "instance_arr" => $inst_arr];
                    }
                    else{
                        if($tag == 'transliteration_ic'){
                            $tag = 'caption_ic';
                        }
                        $alternative[$tag] = $text;
                    }
                }

                if($skip){
                    continue;
                }
                // keyed by digest so identical alternatives collapse into one entry
                $res['Name Alternatives'][getNADigestKey($alternative)] = $alternative;
            }
        }
    }
    // callers get a plain list; the digest keys are only used for de-duplication
    $res['Name Alternatives'] = array_values($res['Name Alternatives']);

    return $res;
}

/**
 * checkEncodingAndOutput - parse an XML string, converting it to UTF-8 first when needed.
 *
 * The encoding is detected (strict mode) among UTF-8, ISO-8859-1 and Windows-1252;
 * anything not detected as UTF-8 is converted before parsing.
 *
 * @param string $output raw XML text
 * @return SimpleXMLElement|false result of simplexml_load_string()
 */
function checkEncodingAndOutput($output){
    $detected = mb_detect_encoding($output, 'UTF-8, ISO-8859-1, Windows-1252', true);

    $xmlText = ($detected === 'UTF-8')
        ? $output
        : mb_convert_encoding($output, 'UTF-8', $detected);

    return simplexml_load_string($xmlText);
}

/**
 * temp_patches - apply the post-analysis patches to a result array.
 *
 * Only the ethnicity / naming-convention check (checkEthNC) is applied;
 * $name is accepted but not used by the active code path.
 *
 * @param array  $res  analysis result holding "Name Alternatives"
 * @param string $name analysed name (unused here)
 * @return array patched result
 */
function temp_patches($res, $name){
    require_once '../allInOneWeb/sqlUtils.php';

    return checkEthNC($res);
}
/**
 * getInstancesFromAttr - split the comma-separated "instance" attribute into trimmed ids.
 *
 * Entries containing "dynamic-inst-" are logged and left out of the result.
 *
 * @param string $instances_str comma-separated instance list
 * @return string[] trimmed instance ids (may be empty)
 */
function getInstancesFromAttr($instances_str){
    $instances = [];
    foreach (explode(',', $instances_str) as $token) {
        if (strpos($token, "dynamic-inst-") === false) {
            $instances[] = trim($token);
            continue;
        }
        //dynamic instance - not part of the result
        write_to_log("ERROR", "dynamic-inst in xml - continue");
    }
    return $instances;
}
/**
 * getNADigestKey - build a compact digest key for one name alternative.
 *
 * Every column contributes createNADigestElement(column, value); array values
 * contribute their "instance_arr" entries joined with "^". The concatenation is
 * zlib-compressed (level 9) and base64-encoded.
 *
 * @param array $na one name alternative (column => string or array with "instance_arr")
 * @return string base64 text of the compressed digest
 */
function getNADigestKey($na){
    $digest = "";
    foreach ($na as $column => $value) {
        if (is_array($value)) {
            $value = implode("^", $value["instance_arr"]);
        }
        $digest .= createNADigestElement($column, $value);
    }

    $compressed = gzcompress($digest, 9);
    return base64_encode($compressed);
}



/**
 * addCaptionIC - attach the IC caption to a name alternative for selected groups.
 *
 * 'caption_ic' is set when the letters-and-spaces-only, lower-cased naming convention
 * or ethnicity of $na is in the respective list below. $na is modified in place and
 * also returned.
 *
 * @param array       $na        name alternative (by reference)
 * @param string|null $icCaption caption to attach; nothing happens when empty
 * @return array the (possibly updated) name alternative
 */
function addCaptionIC(&$na, $icCaption){
    if (!empty($icCaption)) {
        // keep only letters and spaces, then lower-case
        $normalise = function ($raw) {
            return strtolower(preg_replace("/[^A-Za-z ]/", '', $raw));
        };

        $convention = $normalise($na["naming_convention"] ?? "");
        $ethnicity  = $normalise($na["ethnicity"] ?? "");

        $conventionHit = in_array($convention, ["arab","pakistani","persian","afghan","afpak","kurdish","somali"]);
        $ethnicityHit  = in_array($ethnicity, ["arab","pakistani","persian","afghan","afpak","kurdish"]);

        if ($conventionHit || $ethnicityHit) {
            $na['caption_ic'] = $icCaption;
        }
    }

    return $na;
}

/**
 * checkEthNC - PATCH, finalisation: drop name alternatives whose ethnicity does not
 * match their naming convention.
 *
 * For every alternative:
 *  - a 'generic/undefined' naming convention is dropped while more than one alternative remains;
 *  - alternatives without an "ethnicity" key are kept untouched;
 *  - otherwise each ethnicity is mapped through the global $eth_mapping; the alternative is
 *    kept when any mapped convention equals its own, or when either side is empty.
 * If every alternative would be dropped, the original list is restored.
 *
 * @param array $res analysis result holding "Name Alternatives"
 * @return array $res with the filtered (re-indexed) "Name Alternatives"
 */
function checkEthNC($res){

    global $eth_mapping;
    $org_na_arr = $res["Name Alternatives"];

    // Decide on a snapshot and remove from a separate copy, so removals never shift
    // the positions of the alternatives still waiting to be examined.
    $remaining = $org_na_arr;

    foreach ($org_na_arr as $index => $na) {
        $na_nc = stripEthStr($na["naming_convention"] ?? "");

        if($na_nc == 'generic/undefined' && count($remaining) > 1){
            unset($remaining[$index]);
            continue;
        }
        if(!array_key_exists("ethnicity", $na)){
            continue;
        }

        $keep_na = false;
        foreach ((getEthArr($na["ethnicity"] ?? []) ?? []) as $eth_pair){
            foreach (explode(',', strtolower(trim($eth_pair[0]))) as $eth){
                $map_nc = stripEthStr($eth_mapping[$eth] ?? "");

                // nothing to compare against - do not punish the alternative
                if(empty($map_nc) || empty($na_nc)){
                    $keep_na = true;
                    continue;
                }
                //if one ethnicity match the naming convention - keep
                if($map_nc == $na_nc){
                    $keep_na = true;
                    break 2;
                }
            }
        }

        if(!$keep_na){
            write_to_log("ERROR", "ethnicity doesnt match the naming-convention - unset");
            write_to_log("TRACE", "Caption: ".($na["caption"] ?? ""). " & ethnicity: ".($na["ethnicity"] ?? "")." & naming_convention: ".($na["naming_convention"] ?? ""));
            unset($remaining[$index]);
        }
    }

    // never return an empty list: fall back to what we were given
    $res["Name Alternatives"] = empty($remaining) ? $org_na_arr : array_values($remaining);

    return $res;
}

/**
 * literalsCheck - restrict each alternative's ethnicity to those allowed by the name's characters.
 *
 * The allowed ethnicities come from getAllowedEth($name, $literals_eth). When that list is
 * empty (no character had an entry in $literals_eth) the result is returned untouched.
 * Otherwise, for every alternative that has an "ethnicity" key, the ethnicity string is rebuilt
 * from CheckAllowedEth(); alternatives whose rebuilt string is empty are removed and the list
 * is re-indexed.
 *
 * @param array  $res  analysis result holding "Name Alternatives"
 * @param string $name analysed name
 * @return array $res with updated "Name Alternatives"
 */
function literalsCheck($res, $name){
    global $literals_eth, $sub_eth_arr, $eth_parent_arr;

    $allowed_eth = getAllowedEth($name, $literals_eth);

    //if no allowed_eth - means we didnt find unicode-->eth_list match - means all generic chars
    if(empty($allowed_eth)){
        return $res;
    }

    $alternatives = $res["Name Alternatives"] ?? [];
    foreach($alternatives as $index => $na){
        if(!array_key_exists("ethnicity", $na)){
            continue;
        }

        $curr_eth_arr = getEthArr($na["ethnicity"], $allowed_eth);
        $eth_arr = CheckAllowedEth($allowed_eth, $curr_eth_arr, $eth_parent_arr, $sub_eth_arr);

        $eth_str = "";
        if(!empty($eth_arr)){
            foreach($eth_arr as $eth_element){
                $eth_str .= implode(' ', $eth_element);
            }
        }

        if(empty($eth_str)){
            // removed by its real key, so the right alternative is dropped
            unset($alternatives[$index]);
            continue;
        }
        $alternatives[$index]["ethnicity"] = $eth_str;
    }

    // keep "Name Alternatives" a gap-free list after removals
    $res["Name Alternatives"] = array_values($alternatives);

    return $res;
}

/**
 * getAllowedEth - intersect the ethnicity lists of all characters of $name.
 *
 * Each character is looked up in $literals_eth under the key "0x0" followed by the last
 * three hex digits of its UCS-4BE code. Characters without an entry are ignored. The
 * ';'-separated lists of the remaining characters are intersected.
 *
 * @param string $name         name to inspect
 * @param array  $literals_eth map of "0x0XXX" => ';'-separated ethnicity list
 * @return array unique, non-empty ethnicities common to all mapped characters
 */
function getAllowedEth($name, $literals_eth){
    // null until the first character with a mapping is seen
    $common = null;

    foreach (mb_str_split($name) as $char) {
        $hex = Unicode_decode($char);
        $lookupKey = "0x0" . substr($hex, -3);

        $lang_str = $literals_eth[$lookupKey] ?? [];
        if (empty($lang_str)) {
            continue;
        }

        $candidates = explode(';', $lang_str ?? "");
        $common = ($common === null)
            ? $candidates
            : array_intersect($common, $candidates);
    }

    return array_filter(array_unique($common ?? []));
}

/**
 * Unicode_decode - hex representation of $text converted from UTF-8 to UCS-4BE.
 *
 * Every character becomes 8 lower-case hex digits (e.g. "A" -> "00000041").
 *
 * @param string $text UTF-8 text
 * @return string hex string; empty when the conversion fails
 */
function Unicode_decode($text) {
    // convert once; the result is all that is needed
    $ucs4 = iconv("UTF-8", "UCS-4BE", $text);
    if ($ucs4 === false) {
        return '';
    }
    return implode(unpack('H*', $ucs4));
}

/**
 * CheckAllowedEth - filter [ethnicity, percent] pairs down to the allowed ethnicities.
 *
 * A pair is kept when
 *  1. its ethnicity (spaces replaced by '-') is in $allowed_eth, or
 *  2. $sub_arr lists sub-ethnicities for it that are in $allowed_eth - the pair is then
 *     replaced by the comma-joined sub-ethnicities not recorded before, or
 *  3. $parent_arr lists parents for it and one of them is in $allowed_eth.
 * Everything else is collected as removed. Afterwards:
 *  - kept pairs with 0 percent are dropped, unless that would leave nothing;
 *  - the removed percentages are summed, divided evenly by the number of kept pairs
 *    (2 decimals) and added to each kept pair that stays below 100;
 *  - each percent is formatted as " (N%) ".
 *
 * @param array $allowed_eth allowed ethnicity names
 * @param array $eth_arr     list of [ethnicity, percent] pairs
 * @param array $parent_arr  ethnicity => list of parent ethnicities
 * @param array $sub_arr     ethnicity => list of sub ethnicities
 * @return array kept pairs as [ethnicity, " (N%) "]; keys of dropped pairs are not re-used
 */
function CheckAllowedEth($allowed_eth, $eth_arr, $parent_arr, $sub_arr){
    $keep_arr = [];
    $remove_arr = [];
    $keep_arr_record = []; //names already represented in $keep_arr through the sub/parent rules

    foreach($eth_arr as $eth){
        $eth[0] = str_replace(' ', '-', $eth[0]);
        // the record only changes right before a `continue`, so one lookup per pair is enough
        $already_recorded = in_array($eth[0], $keep_arr_record);

        //check if we have same eth in the allowed-eth
        if(in_array($eth[0], $allowed_eth) && !$already_recorded){
            $keep_arr[] = $eth;
            continue;
        }

        //check if ethnicity has sub eth in the allowed eth
        if(array_key_exists($eth[0], $sub_arr)){
            $inter_arr = array_intersect($sub_arr[$eth[0]], $allowed_eth);
            if(!empty($inter_arr)){
                $inter_arr_final = [];
                foreach($inter_arr as $int_element){
                    if(!in_array($int_element, $keep_arr_record)){
                        $inter_arr_final[] = $int_element;
                    }
                }
                if(!empty($inter_arr_final)){
                    $keep_arr_record = array_merge($keep_arr_record, $inter_arr_final);
                    $keep_arr[] = [implode(',', $inter_arr_final), $eth[1]];
                }
                continue;
            }
        }

        //check if it has parent eth in the allowed
        if(array_key_exists($eth[0], $parent_arr) && !$already_recorded){
            if(!empty(array_intersect($parent_arr[$eth[0]], $allowed_eth))){
                // record the NAME (not the whole pair) so the in_array() checks above can match it
                $keep_arr_record[] = $eth[0];
                $keep_arr[] = $eth;
                continue;
            }
        }

        if(!$already_recorded){
            $remove_arr[] = $eth;
        }
    }

    if(empty($keep_arr)){
        return $keep_arr;
    }

    //remove literals (0%) elements if statistic ethnicites exists
    $non_zero = [];
    foreach($keep_arr as $key => $element){
        if($element[1] != 0){
            $non_zero[$key] = $element;
        }
    }
    if(!empty($non_zero)){
        $keep_arr = $non_zero;
    }

    //gets the prec to add to remain elements
    $prec = 0;
    foreach($remove_arr as $element){
        $prec += $element[1];
    }
    if($prec > 0){
        $share = (float) number_format((float)$prec / count($keep_arr), 2, '.', '');
        foreach($keep_arr as $key => $element){
            if($element[1] < 100 - $share){
                $keep_arr[$key][1] += $share;
            }
        }
    }

    foreach($keep_arr as $key => $element){
        $keep_arr[$key][1] = " (".$element[1]."%) ";
    }

    return $keep_arr;
}

/**
 * getEthArr - parse an ethnicity string such as "Arab (60%) English (40%)" into pairs.
 *
 * The string is split on ")"; each fragment is split on "(" into a lower-cased, trimmed
 * name and an integer percentage (100 when the fragment has no "(" part). When
 * $allowed_eth is given, every allowed ethnicity that is not in the string is appended
 * with percentage 0.
 *
 * @param string     $eth_str     ethnicity string
 * @param array|null $allowed_eth optional ethnicities to append with 0 percent
 * @return array list of [name, percent] pairs; empty when nothing could be parsed
 */
function getEthArr($eth_str, $allowed_eth = null){
    if(empty($eth_str)){
        return [];
    }

    // initialised up front so an input with no usable fragment yields [] instead of an undefined variable
    $pairs = [];
    $track_arr = [];

    foreach(array_filter(explode(")", trim($eth_str))) as $str_eth){
        $pair = explode('(', $str_eth);
        $pair[0] = strtolower(trim($pair[0]));
        if(count($pair) < 2){
            $pair[1] = 100;
        }
        $pair[1] = (int) filter_var($pair[1], FILTER_SANITIZE_NUMBER_INT);
        $pairs[] = $pair;
        $track_arr[] = $pair[0];
    }

    if(!empty($allowed_eth)){
        foreach($allowed_eth as $eth){
            if(!in_array($eth, $track_arr)){
                $pairs[] = [$eth, 0];
            }
        }
    }
    return $pairs;
}

/**
 * stripEthStr - normalise an ethnicity / naming-convention string for comparison:
 * trimmed, lower-cased, and with every '-' replaced by a space.
 *
 * @param string $eth raw string
 * @return string normalised string
 */
function stripEthStr($eth){
    return str_replace('-', ' ', strtolower(trim($eth)));
}



