<?php

/****
 * The attached files are:
1) Given names in KB without Arabs
2) Arab Given names in KB
3) Given name in KB captions without Arabs
4) Given Names nc + Captions from Given name frequency table
5) The php file (in txt format) according to the API with all the names in (1) in it to try to run
6) The php file (in txt format) according to the API with all the names (5) in it to try to run
There are probably duplicates between 5 and 6 because theoretically all the given names in the table should have instances in the KB

The task:
1) Try to run the API of Forbears to get all the given names in the KB  (1) using (5)
2) try to run the API of Forbears to get all the given names in the KB  (3) using (6)
3) If possible to merge the two in order to prevent duplication.

The output is in the form:
    [incidence] => 5470092
    [percent] => 1.507098721
    [ratio] => 66
    [rank] => 1
    [jurisdiction] => United States
    [iso] => us
    
We have to get it in the format of surnames.tsv:
(instance, country (the IntuView instance of the country), occurrence (incidence), rank, percent)
 */

require_once '../../write_log.php';

class Forbears{
    // API key; set by the constructor and appended to the request URL by the fetch* methods.
    private $api_key = "";
    // API field name => TSV column name. Not referenced by any method visible in this class.
    private $fields_conv = array("incidence" => "occurrence",
                                "name" => "instance",
                                "jurisdiction" => "country"
    );
    // API field names; presumably the ones to leave out of the TSV (TODO confirm).
    // Not referenced by any method visible in this class.
    private $skip_fields = array("iso", "ratio");
    // Filled by the constructor from input_files/places_conv.txt:
    // leading words of a line => last word of that line.
    private $countries_conv = [];
    // Trimmed lines of input_files/given_names_conv.txt (loaded by the constructor).
    private $given_names_conv = [];
    // Trimmed lines of input_files/family_names_conv.txt (loaded by the constructor).
    private $family_names_conv = [];
    
    // Column names in the order given by the file header comment for the target TSV.
    // Not referenced by any method visible in this class.
    private $order_col = array('instance', 'country', 'occurrence', 'rank', 'percent');
   
    /**
     * Stores the API key and loads the three conversion tables from the
     * "input_files" directory, resolved against the current working directory.
     *
     * places_conv.txt is read line by line: lines that are empty or contain
     * "<<<" are skipped; in every other line the last whitespace-separated
     * token becomes the value and the preceding tokens, joined by a single
     * space, become the key of $countries_conv. Lines with fewer than two
     * tokens carry no mapping and are ignored.
     *
     * @param string $key API key appended to the request URL by the fetch* methods
     */
    public function __construct($key){
        $this->api_key = $key;

        $input_dir = 'input_files'.DIRECTORY_SEPARATOR;
        $this->given_names_conv = $this->readTrimmedLines($input_dir.'given_names_conv.txt');
        $this->family_names_conv = $this->readTrimmedLines($input_dir.'family_names_conv.txt');

        foreach ($this->readTrimmedLines($input_dir.'places_conv.txt') as $country_line){
            if ($country_line === '' || strpos($country_line, "<<<") !== false){
                continue;
            }
            // PREG_SPLIT_NO_EMPTY yields a 0-based list, so the last token is
            // really the last one even when the line starts with whitespace
            // (array_filter() kept the original keys and broke the index math).
            $tokens = preg_split('/\s+/', $country_line, -1, PREG_SPLIT_NO_EMPTY);
            if (count($tokens) < 2){
                continue;
            }
            $instance = array_pop($tokens);
            $this->countries_conv[implode(' ', $tokens)] = $instance;
        }
    }

    /**
     * Reads a text file and returns its lines with surrounding whitespace removed.
     *
     * @param string $path file to read
     * @return array list of trimmed lines; empty (with a warning) when the file cannot be read
     */
    private function readTrimmedLines($path){
        $data = is_readable($path) ? file_get_contents($path) : false;
        if ($data === false){
            trigger_error("Forbears: cannot read conversion file '$path'", E_USER_WARNING);
            return [];
        }
        return array_map('trim', explode("\n", $data));
    }
    

    
    /**
     * Posts a batch of names to https://ono.4b.rs/v1/nats, prints a
     * human-readable summary of every returned entry and stores the response.
     *
     * Two files are written into $output_path:
     *  - "<unix time>.json" with the decoded "results" list. If that name is
     *    already taken a "_1", "_2", ... suffix is added, so two batches that
     *    finish within the same second do not overwrite each other.
     *  - "no_output.txt" with, as JSON, the result entries that came back
     *    without any country. It is overwritten on every call.
     *
     * A failed request or an undecodable response raises an E_USER_WARNING and
     * is treated as an empty result list; the files are still written.
     *
     * Side effects: switches error reporting to E_ALL with display_errors on,
     * sets LC_CTYPE to en_US.UTF-8 and the mb regex encoding to UTF-8.
     *
     * @param array  $src         name entries; JSON-encoded and sent as the "names" form field
     * @param string $output_path existing directory that receives the files
     * @return void
     */
    public function fetchNamesData($src, $output_path = "output_files"){
        error_reporting( E_ALL );
        ini_set( 'display_errors', 1 );
        setlocale( LC_CTYPE, 'en_US.UTF-8' );
        mb_regex_encoding( 'UTF-8' );

        $c = curl_init();
        curl_setopt( $c, CURLOPT_URL, 'https://ono.4b.rs/v1/nats?key=' . $this->api_key );
        curl_setopt( $c, CURLOPT_POST, 1 );
        // The body is a form-encoded string, so the JSON must be URL-encoded;
        // otherwise "&", "+" or "%" inside a name would corrupt the payload.
        curl_setopt( $c, CURLOPT_POSTFIELDS, 'names=' . urlencode( json_encode( $src ) ) );
        curl_setopt( $c, CURLOPT_RETURNTRANSFER, true );
        $out = curl_exec( $c );
        $curl_error = ( $out === false ) ? curl_error( $c ) : '';
        curl_close( $c );

        $results = [];
        if( $out === false ){
            trigger_error( 'Forbears: names request failed: ' . $curl_error, E_USER_WARNING );
        }
        else{
            $decoded = json_decode( $out );
            if( is_object( $decoded ) && isset( $decoded->results )
                && ( is_array( $decoded->results ) || is_object( $decoded->results ) ) ){
                $results = $decoded->results;
            }
            else{
                trigger_error( 'Forbears: names response has no decodable "results" list', E_USER_WARNING );
            }
        }

        $nl = ( php_sapi_name() == 'cli' ) ? "\n" : '<br />';
        $no_output = [];

        foreach( $results as $person ){
            if( !is_object( $person ) ){
                continue;
            }
            $forename = isset( $person->forename ) ? $person->forename : '';
            $surname = isset( $person->surname ) ? $person->surname : '';

            if( isset( $person->id ) ){
                echo '(' . $person->id . ') ';
            }
            echo $forename . ' ' . mb_strtoupper( $surname );
            if( isset( $person->secondSurname ) && $person->secondSurname ){
                echo ' ' . mb_strtoupper( $person->secondSurname );
            }
            echo $nl;

            if( isset( $person->countries[0] ) ){
                echo 'Most likely from: ';
                echo $person->countries[0]->jurisdiction . $nl;
            }
            if( isset( $person->countries[1] ) ){
                echo '2nd most likely from: ';
                echo $person->countries[1]->jurisdiction . $nl;
            }
            if( !isset( $person->countries[0] ) ){
                echo 'No data found' . $nl;
                // Keep the whole entry (not just a string) so the file stays
                // a list of objects like the main output.
                $no_output[] = $person;
            }
            echo  '------------------------------------------------------------------------------------' . $nl;
        }

        // write the full JSON to the FS, never reusing an existing file name
        $base = $output_path.DIRECTORY_SEPARATOR.time();
        $json_file = $base.".json";
        for( $n = 1; file_exists( $json_file ); $n++ ){
            $json_file = $base."_".$n.".json";
        }
        if( file_put_contents( $json_file, json_encode( $results ) ) === false ){
            trigger_error( "Forbears: cannot write '$json_file'", E_USER_WARNING );
        }

        // write the entries that produced no output
        $no_output_file = $output_path.DIRECTORY_SEPARATOR."no_output.txt";
        if( file_put_contents( $no_output_file, json_encode( $no_output ) ) === false ){
            trigger_error( "Forbears: cannot write '$no_output_file'", E_USER_WARNING );
        }
    }
    
    /**
     * Posts a batch of names to https://ono.4b.rs/v2/jurs and stores the
     * decoded "results" list as JSON in $output_path.
     *
     * The JSON file is named "<unix time>.json"; if that name is already taken
     * a "_1", "_2", ... suffix is added so that two batches finishing within
     * the same second do not overwrite each other. Names of result entries
     * that have no "jurisdictions" property are appended, one per line, to
     * "no_output_<unix time>.csv" in the same directory.
     *
     * A failed request or an undecodable response raises an E_USER_WARNING and
     * is treated as an empty result list ("[]" is written).
     *
     * Side effects: switches error reporting to E_ALL with display_errors on,
     * sets LC_CTYPE to en_US.UTF-8 and the mb regex encoding to UTF-8.
     *
     * @param array  $src         name entries; JSON-encoded and sent as the "names" form field
     * @param string $name_type   not used by this method; kept for call compatibility
     * @param string $output_path existing directory that receives the files
     * @return string path of the JSON file for this batch
     */
    public function fetchJursData($src, $name_type, $output_path = "output_files"){
        error_reporting( E_ALL );
        ini_set( 'display_errors', 1 );
        setlocale( LC_CTYPE, 'en_US.UTF-8' );
        mb_regex_encoding( 'UTF-8' );

        $c = curl_init();
        curl_setopt( $c, CURLOPT_URL, 'https://ono.4b.rs/v2/jurs?key=' . $this->api_key );
        curl_setopt( $c, CURLOPT_POST, 1 );
        // The body is a form-encoded string, so the JSON must be URL-encoded;
        // otherwise "&", "+" or "%" inside a name would corrupt the payload.
        curl_setopt( $c, CURLOPT_POSTFIELDS, 'names=' . urlencode( json_encode( $src ) ) );
        curl_setopt( $c, CURLOPT_RETURNTRANSFER, true );
        $out = curl_exec( $c );
        $curl_error = ( $out === false ) ? curl_error( $c ) : '';
        curl_close ( $c );

        $results = [];
        if( $out === false ){
            trigger_error( 'Forbears: jurisdictions request failed: ' . $curl_error, E_USER_WARNING );
        }
        else{
            $decoded = json_decode( $out );
            if( is_object( $decoded ) && isset( $decoded->results )
                && ( is_array( $decoded->results ) || is_object( $decoded->results ) ) ){
                $results = $decoded->results;
            }
            else{
                trigger_error( 'Forbears: jurisdictions response has no decodable "results" list', E_USER_WARNING );
            }
        }

        // collect the names the service returned no jurisdictions for
        $no_output = [];
        foreach( $results as $person_output ){
            if( !is_object( $person_output ) || property_exists( $person_output, 'jurisdictions' ) ){
                continue;
            }
            if( isset( $person_output->name ) ){
                $no_output[] = $person_output->name;
            }
        }

        // write the full JSON to the FS, never reusing an existing file name
        $base = $output_path.DIRECTORY_SEPARATOR.time();
        $outputfile = $base.".json";
        for( $n = 1; file_exists( $outputfile ); $n++ ){
            $outputfile = $base."_".$n.".json";
        }
        if( file_put_contents( $outputfile, json_encode( $results ) ) === false ){
            trigger_error( "Forbears: cannot write '$outputfile'", E_USER_WARNING );
        }

        // append the names that produced no output
        $no_output_file = $output_path.DIRECTORY_SEPARATOR."no_output_".time().".csv";
        $fp = fopen( $no_output_file, "a+" );
        if( $fp === false ){
            trigger_error( "Forbears: cannot write '$no_output_file'", E_USER_WARNING );
        }
        else{
            foreach( $no_output as $missing_name ){
                fputcsv( $fp, [$missing_name] );
            }
            fclose( $fp );
        }

        return $outputfile;
    }
    
    /**
     * Flattens every JSON file found in $output_path into "./results.tsv"
     * (created in the current working directory, overwriting an older one).
     *
     * Each file is expected to hold a list of entries shaped like
     * {"name": ..., "jurisdictions": [{"jurisdiction", "incidence", "rank"}, ...]}.
     * One row "name, jurisdiction, incidence, rank" is written per
     * jurisdiction; double quotes are removed from the jurisdiction. Files
     * that are not JSON lists and entries without a "jurisdictions" list are
     * skipped. Files are processed in the sorted order returned by scandir().
     *
     * NOTE(review): the file header comment asks for a fifth "percent"
     * column; it is not emitted here - confirm before relying on this file as
     * the final format.
     *
     * @param string $output_path directory holding the JSON files
     * @return void
     */
    public function createTSV($output_path){
        $entries = is_dir($output_path) ? scandir($output_path) : false;
        if ($entries === false) {
            return;
        }

        $fp = fopen('./results.tsv', 'w');
        if ($fp === false) {
            trigger_error("Forbears: cannot write './results.tsv'", E_USER_WARNING);
            return;
        }

        foreach ($entries as $entry) {
            $file_path = $output_path.DIRECTORY_SEPARATOR.$entry;
            // is_file() also rules out "." and ".."
            if (!is_file($file_path)) {
                continue;
            }
            $arr = json_decode(file_get_contents($file_path), true);
            if (empty($arr) || !is_array($arr)) {
                continue;
            }
            foreach ($arr as $inner_element) {
                // Stray scalars (e.g. a list of plain names) must not reach the
                // array accessors below.
                if (!is_array($inner_element)
                    || !isset($inner_element["jurisdictions"])
                    || !is_array($inner_element["jurisdictions"])) {
                    continue;
                }
                $name = isset($inner_element["name"]) ? $inner_element["name"] : '';
                foreach ($inner_element["jurisdictions"] as $jur) {
                    if (!is_array($jur)) {
                        continue;
                    }
                    $country = str_replace('"', "", isset($jur["jurisdiction"]) ? $jur["jurisdiction"] : '');
                    $row = [
                        $name,
                        $country,
                        isset($jur["incidence"]) ? $jur["incidence"] : '',
                        isset($jur["rank"]) ? $jur["rank"] : '',
                    ];
                    fputcsv($fp, $row, "\t");
                }
            }
        }

        fclose($fp);
    }
    
    
    /**
     * Reads a list of names (one per line) and sends it to the API in batches.
     *
     * Blank lines are ignored and lines that are not valid UTF-8 are skipped
     * (their number is reported with an E_USER_NOTICE at the end). Every
     * remaining name becomes an entry ["name", "type" => $name_type,
     * "id" => running index]. A batch is dispatched to fetchNamesData()
     * ($type "names") or fetchJursData() ($type "jurs") whenever it reaches
     * $batch_size entries, and once more for the remainder.
     *
     * @param string $path        file with one name per line
     * @param string $type        "names" or "jurs"
     * @param string $name_type   value stored in the "type" field of every entry
     * @param string $output_path directory for the result files; created if missing
     * @param int    $batch_size  maximum number of names per request (default 1000)
     * @return void
     * @throws InvalidArgumentException when $type is neither "names" nor "jurs"
     * @throws RuntimeException         when $path cannot be opened
     */
    public function runOnList2($path, $type, $name_type, $output_path, $batch_size = 1000){
        // Reject an unknown type before any work is done instead of silently
        // terminating the script in the middle of the run.
        if ($type !== "names" && $type !== "jurs"){
            throw new InvalidArgumentException("runOnList2: unknown type '$type' (expected 'names' or 'jurs')");
        }
        $batch_size = max(1, (int)$batch_size);

        if (!is_dir($output_path)){
            mkdir($output_path, 0777, true);
        }

        $file = fopen($path, 'r');
        if ($file === false){
            throw new RuntimeException("runOnList2: cannot open '$path'");
        }

        $counter = 0;
        $counter_non_utf8 = 0;
        $batch = [];
        try {
            while (($line = fgetcsv($file, 0, "\n")) !== false) {
                // a blank line comes back as [null]
                $name = isset($line[0]) ? trim((string)$line[0]) : '';
                if ($name === ''){
                    continue;
                }
                if (!preg_match('//u', $name)){
                    $counter_non_utf8++;
                    continue;
                }
                if (count($batch) >= $batch_size){
                    $this->sendBatch($type, $batch, $name_type, $output_path);
                    $batch = [];
                }
                $batch[] = ["name"=>$name, "type"=>$name_type, "id"=>$counter];
                $counter++;
            }
            if (!empty($batch)){
                $this->sendBatch($type, $batch, $name_type, $output_path);
            }
        } finally {
            fclose($file);
        }

        if ($counter_non_utf8 > 0){
            trigger_error("runOnList2: skipped $counter_non_utf8 line(s) that are not valid UTF-8", E_USER_NOTICE);
        }
    }

    /**
     * Sends one batch to the fetch method selected by $type ("names" or "jurs").
     *
     * @param string $type        "names" or "jurs"
     * @param array  $batch       name entries
     * @param string $name_type   forwarded to fetchJursData()
     * @param string $output_path directory for the result files
     * @return void
     */
    private function sendBatch($type, $batch, $name_type, $output_path){
        if ($type === "names"){
            $this->fetchNamesData($batch, $output_path);
        }
        else{
            $this->fetchJursData($batch, $name_type, $output_path);
        }
    }
    
    /**
     * Collects the decoded content of every JSON file in $output_path into one
     * list and writes it to "<output_path>/final.json".
     *
     * Each source file contributes ONE element (its whole decoded document),
     * so the result is a list of per-file documents, not a flattened list of
     * entries. Files that do not decode to a non-empty value are skipped, and
     * so is a "final.json" left over from an earlier run - otherwise every
     * re-run would nest the previous result into the new one. An empty
     * directory yields "[]".
     *
     * @param string $output_path directory holding the JSON files
     * @return void
     */
    public function mergeOutputFiles($output_path){
        $entries = is_dir($output_path) ? scandir($output_path) : false;
        if ($entries === false){
            trigger_error("mergeOutputFiles: cannot read directory '$output_path'", E_USER_WARNING);
            return;
        }

        $arr = [];
        foreach ($entries as $file){
            $file_path = $output_path.DIRECTORY_SEPARATOR.$file;
            // is_file() also rules out "." and ".."
            if ($file === "final.json" || !is_file($file_path)){
                continue;
            }
            $data = json_decode(file_get_contents($file_path));
            if (empty($data)){
                continue;
            }
            $arr[] = $data;
        }

        $final_output = json_encode($arr);
        if ($final_output === false){
            echo json_last_error_msg();
            return;
        }

        //write the full JSON to the FS
        $final_path = $output_path.DIRECTORY_SEPARATOR."final.json";
        if (file_put_contents($final_path, $final_output) === false){
            trigger_error("mergeOutputFiles: cannot write '$final_path'", E_USER_WARNING);
        }
    }

    /**
     * Unfinished: loads both JSON files as associative arrays and stops there.
     * Nothing is compared, written or returned yet.
     *
     * @param string $big_list   path of the first JSON file
     * @param string $small_list path of the second JSON file
     */
    function removeDuplicates($big_list, $small_list){
        $big_raw = file_get_contents($big_list);
        $big_entries = json_decode($big_raw, true);

        $small_raw = file_get_contents($small_list);
        $small_entries = json_decode($small_raw, true);
        #TODO: complete
    }
    
    /**
     * Converts every JSON file in $folder into rows of one tab-separated file.
     *
     * Each JSON file is expected to hold a list of records. One output line is
     * produced per record: its values in order, each followed by a tab (the
     * trailing tab of the original format is kept), with tabs inside a value
     * replaced by a space. Nested arrays are written as JSON text. Files that
     * do not decode to a non-empty list are skipped. With $save_keys the keys
     * of the first record are written once as a header line.
     *
     * @param string $filepath  destination file (overwritten)
     * @param string $folder    directory holding the JSON files
     * @param bool   $save_keys whether to emit a header line
     * @return bool true when the file was written, false when it is not writable
     */
    public function write_tabbed_file($filepath, $folder, $save_keys=false){
        $content = '';
        $header_written = false;
        $counter = 0;

        $entries = is_dir($folder) ? scandir($folder) : false;
        if ($entries === false){
            $entries = [];
        }

        foreach ($entries as $entry){
            $counter++;
            write_to_log("TRACE", "Counter is $counter");

            if ($entry == "." || $entry == ".."){
                continue;
            }
            // decode to associative arrays so records can be iterated with foreach
            $records = json_decode(file_get_contents($folder.DIRECTORY_SEPARATOR.$entry), true);
            if (empty($records) || !is_array($records)){
                // one empty or unreadable file must not abort the whole export
                continue;
            }

            if ($save_keys && !$header_written){
                $keys = array_keys((array)reset($records));
                $content .= implode("\t", $keys)."\n";
                $header_written = true;
            }

            foreach ($records as $element){
                $values = is_array($element) ? $element : [$element];
                foreach ($values as $val){
                    if (is_array($val)){
                        $val = json_encode($val);
                    }
                    // replace tabs in values to [space]
                    $content .= str_replace("\t", " ", (string)$val) . "\t";
                }
                $content .= "\n";
            }
        }

        if (file_exists($filepath) && !is_writeable($filepath)){
            return false;
        }
        $fp = fopen($filepath, 'w+');
        if ($fp === false){
            return false;
        }
        fwrite($fp, $content);
        fclose($fp);
        return true;
    }

    /**
     * Replaces captions in the first column of a TSV by their instance ids.
     *
     * $input_file is a tab-separated lookup table with a header line and the
     * columns "instance, caption". $all_given is the tab-separated file to
     * fix; its first line is kept as header. In every following row the first
     * column is trimmed and lower-cased; if it does not contain "nc-" and
     * equals a (lower-cased) caption of the lookup table, it is replaced by
     * the matching (lower-cased) instance. Blank lines are dropped.
     *
     * Output format: the header joined by tabs, then one line per row in which
     * every value is followed by a tab (tabs inside values become spaces).
     * Lower-casing uses strtolower(), i.e. it is byte-wise.
     *
     * @param string $input_file  lookup table (instance, caption)
     * @param string $all_given   TSV whose first column is to be fixed
     * @param string $output_dest destination file (overwritten)
     * @return boolean true on success, false when a file cannot be opened or written
     */
    public function fix_instance_caption($input_file, $all_given, $output_dest){
        $delimiter = "\t";

        // caption => instance
        $caption_to_instance = [];
        $fp = fopen($input_file, 'r');
        if ($fp === false){
            return false;
        }
        $first = true;
        while (($line = fgets($fp)) !== false){
            // the header can only be skipped after it has been read
            if ($first){
                $first = false;
                continue;
            }
            $line = rtrim($line, "\r\n");
            if ($line === ''){
                continue;
            }
            $data = str_getcsv($line, $delimiter);
            if (!isset($data[1])){
                continue;
            }
            $caption_to_instance[trim(strtolower($data[1]))] = trim(strtolower((string)$data[0]));
        }
        fclose($fp);

        $fp = fopen($all_given, 'r');
        if ($fp === false){
            return false;
        }
        $header = null;
        $rows = [];
        while (($line = fgets($fp)) !== false){
            $line = rtrim($line, "\r\n");
            if ($header !== null && $line === ''){
                continue;
            }
            $data_tsv = str_getcsv($line, $delimiter);
            $data_tsv[0] = trim(strtolower((string)$data_tsv[0]));
            if ($header === null){
                $header = $data_tsv;
                continue;
            }
            if ($data_tsv[0] !== ''
                && strpos($data_tsv[0], 'nc-') === false
                && array_key_exists($data_tsv[0], $caption_to_instance)){
                $data_tsv[0] = $caption_to_instance[$data_tsv[0]];
            }
            $rows[] = $data_tsv;
        }
        fclose($fp);

        $content = "";
        if ($header !== null){
            $content .= implode("\t", $header)."\n";
            foreach ($rows as $row){
                foreach ($row as $val){
                    $content .= str_replace("\t", " ", (string)$val) . "\t";
                }
                $content .= "\n";
            }
        }

        if (file_exists($output_dest) && !is_writeable($output_dest)){
            return false;
        }
        $fp = fopen($output_dest, 'w+');
        if ($fp === false){
            return false;
        }
        fwrite($fp, $content);
        fclose($fp);
        return true;
    }
        
        
        
      /**
       * Lists the first-column values of a TSV that do not contain "nc-" and
       * writes them, as a JSON array, to $output_dest.
       *
       * The first line of $tsv_file is treated as a header and skipped; blank
       * lines and rows with an empty first column are ignored. Duplicates are
       * kept. Values that are not valid UTF-8 are converted from ISO-8859-1 so
       * that json_encode() accepts them; valid UTF-8 is passed through
       * unchanged (no double encoding).
       *
       * @param string $tsv_file    tab-separated input file
       * @param string $output_dest destination JSON file (overwritten)
       * @return boolean true when the file was written, false on any I/O or encoding failure
       */
      public function count_not_nc($tsv_file, $output_dest){
          $delimiter = "\t";

          // check the file that is actually going to be written
          if (file_exists($output_dest) && !is_writeable($output_dest)){
              return false;
          }

          $fp = fopen($tsv_file, 'r');
          if ($fp === false){
              return false;
          }

          $final = [];
          $first = true;
          while (($line = fgets($fp)) !== false){
              if ($first){
                  $first = false;
                  continue;
              }
              $line = rtrim($line, "\r\n");
              if ($line === ''){
                  continue;
              }
              $data_tsv = str_getcsv($line, $delimiter);
              $instance = (string)$data_tsv[0];
              if ($instance === '' || strpos($instance, 'nc-') !== false){
                  continue;
              }
              if (!mb_check_encoding($instance, 'UTF-8')){
                  $instance = mb_convert_encoding($instance, 'UTF-8', 'ISO-8859-1');
              }
              $final[] = $instance;
          }
          fclose($fp);

          $data = json_encode($final);
          if ($data === false){
              return false;
          }
          return file_put_contents($output_dest, $data) !== false;
      }
    
    /**
     * Writes the distinct first-column values of a TSV that do not contain
     * "nc-" to "./non_nc.tsv" (current working directory), one per line, in
     * order of first appearance.
     *
     * Every line of the input is considered (no header is skipped); blank
     * lines and rows with an empty first column are ignored.
     *
     * @param string $TSV_file tab-separated input file
     * @return void
     */
    public function find_non_nc($TSV_file){
        $file = fopen($TSV_file, 'r');
        if ($file === false){
            trigger_error("find_non_nc: cannot open '$TSV_file'", E_USER_WARNING);
            return;
        }

        // names are used as keys: constant-time duplicate check, insertion order kept
        $seen = [];
        while (($data = fgetcsv($file, 0, "\t")) !== false){
            // a blank line comes back as [null]
            $name = isset($data[0]) ? (string)$data[0] : '';
            if ($name === '' || strpos($name, "nc-") !== false){
                continue;
            }
            $seen[$name] = true;
        }
        fclose($file);

        $fp = fopen("./non_nc.tsv", 'w+');
        if ($fp === false){
            trigger_error("find_non_nc: cannot write './non_nc.tsv'", E_USER_WARNING);
            return;
        }
        foreach (array_keys($seen) as $name){
            // purely numeric names come back as int keys
            fputcsv($fp, [(string)$name], "\t");
        }
        fclose($fp);
    }
    
    /**
     * Rewrites the first column of a TSV with instance ids taken from a
     * lookup file and stores the result in "./FIXED_NEW_merged.tsv"
     * (current working directory).
     *
     * $all_file is tab-separated with the replacement value in column 0 and a
     * name in column 1; the name is stripped of surrounding "(" / ")" and of
     * every "s:n" substring and lower-cased to form the lookup key. For each
     * row of $TSV_file whose first column does not contain "nc-" and whose
     * lower-cased value matches a key, the first column is replaced. All rows
     * are written back in their original order.
     *
     * Side effect: raises memory_limit to 10048M, because the whole TSV is
     * held in memory before it is written.
     *
     * @param string $all_file lookup file (replacement, name)
     * @param string $TSV_file TSV to fix
     * @return void
     */
    public function fixTSVnc($all_file, $TSV_file){
        ini_set('memory_limit', '10048M');

        // lower-cased name => replacement for the first column
        $change_arr = [];
        $file = fopen($all_file, 'r');
        if ($file === false){
            trigger_error("fixTSVnc: cannot open '$all_file'", E_USER_WARNING);
            return;
        }
        while (($data = fgetcsv($file, 0, "\t")) !== false){
            if (!isset($data[0], $data[1])){
                continue;
            }
            $name = str_replace('s:n', '', trim($data[1], '()'));
            $change_arr[strtolower($name)] = $data[0];
        }
        fclose($file);

        // go over the TSV and fix the rows that are not "nc-" instances yet
        $file = fopen($TSV_file, 'r');
        if ($file === false){
            trigger_error("fixTSVnc: cannot open '$TSV_file'", E_USER_WARNING);
            return;
        }
        $list_names = [];
        while (($data = fgetcsv($file, 0, "\t")) !== false){
            $name = isset($data[0]) ? (string)$data[0] : '';
            if ($name !== '' && strpos($name, "nc-") === false){
                $lookup = strtolower($name);
                if (array_key_exists($lookup, $change_arr)){
                    $data[0] = $change_arr[$lookup];
                }
            }
            $list_names[] = $data;
        }
        fclose($file);

        // The output is opened only after the input has been read completely,
        // so passing the output path as $TSV_file keeps working.
        $fp = fopen("./FIXED_NEW_merged.tsv", 'w+');
        if ($fp === false){
            trigger_error("fixTSVnc: cannot write './FIXED_NEW_merged.tsv'", E_USER_WARNING);
            return;
        }
        foreach ($list_names as $result){
            fputcsv($fp, $result, "\t");
        }
        fclose($fp);
    }
    
    /**
     * Merges rows of a TSV that share the same name and country and writes
     * the result to "./FIXED_merged_new.tsv" (current working directory).
     *
     * Input columns: name, country, occurrence, percent (further columns are
     * ignored). The first row of every name/country pair is taken as is; for
     * each further row of the same pair the occurrence is added and the
     * percent is summed with bcadd() at 10 decimals. A duplicate row is
     * ignored when either side is not numeric. Rows with fewer than four
     * columns (including blank lines) are skipped.
     *
     * NOTE(review): createTSV() in this class writes "rank" as the fourth
     * column, so feeding its output here would sum ranks as percentages -
     * confirm which file this is meant to read.
     *
     * Side effects: raises memory_limit to 11048M and writes one TRACE log
     * line per name through write_to_log().
     *
     * @param string $path tab-separated input file
     * @return void
     */
    public function merge_person_countires($path){
        ini_set('memory_limit', '11048M');

        $file = fopen($path, 'r');
        if ($file === false){
            trigger_error("merge_person_countires: cannot open '$path'", E_USER_WARNING);
            return;
        }

        // bcadd() only accepts plain decimal strings; anything else that is
        // still numeric (e.g. "5.021E-6") is expanded to 10 decimals first.
        $to_decimal = function ($value){
            if (is_string($value) && preg_match('/^[+-]?(\d+\.?\d*|\.\d+)$/', $value) === 1){
                return $value;
            }
            return number_format((float)$value, 10, '.', '');
        };

        // name => country => ["occ" => ..., "perc" => ...]
        $results = [];
        while (($data = fgetcsv($file, 0, "\t")) !== false){
            if (count($data) < 4){
                continue;
            }
            list($name, $country, $occ, $perc) = $data;

            if (!isset($results[$name][$country])){
                $results[$name][$country] = ["occ"=>$occ, "perc"=>$perc];
                continue;
            }

            $current = $results[$name][$country];
            if (!is_numeric($occ) || !is_numeric($perc)
                || !is_numeric($current["occ"]) || !is_numeric($current["perc"])){
                continue;
            }
            $results[$name][$country]["occ"] = $current["occ"] + $occ;
            $results[$name][$country]["perc"] = bcadd($to_decimal($current["perc"]), $to_decimal($perc), 10);
        }
        fclose($file);

        $fp = fopen("./FIXED_merged_new.tsv", 'w+');
        if ($fp === false){
            trigger_error("merge_person_countires: cannot write './FIXED_merged_new.tsv'", E_USER_WARNING);
            return;
        }
        $count = 0;
        $total = count($results);
        foreach ($results as $name=>$arr){
            $count++;
            write_to_log("TRACE", "count: $count/ ". $total);
            foreach ($arr as $country=>$data){
                $row = [$name, $country, $data["occ"], $data["perc"]];
                fputcsv($fp, $row, "\t");
            }
        }
        fclose($fp);
    }
    
    /**
     * create tsv file from jsons files in $output_path
     * @param unknown $output_path
     */

    /**
     * Computes, for every name in a TSV, the (population) standard deviation
     * of its rank values:
     *
     *   AR (average rank) = (RankA + RankB + ... + RankN) / N
     *   SD = sqrt( 1/N * ( (RankA-AR)^2 + (RankB-AR)^2 + ... + (RankN-AR)^2 ) )
     *
     * where Rank is the rank of the name in each country. A LOW SD means the
     * name holds a similar rank everywhere (e.g. David, Daniel); a high SD
     * means its rank differs strongly between countries (names prevalent in
     * one language group only, e.g. Pablo, Jose, Juan).
     * Example with names A, B in countries F, G, H, J:
     *   A has ranks 50, 100, 1000, 7000 - AR=2037.5, SD=2889.93
     *   B has ranks 50, 49, 51, 52      - AR=50.5,   SD=1.118
     * So B is far more evenly prevalent and less ethnicity specific than A.
     *
     * The first line of the file is a header and is skipped. The name is read
     * from column 0 and the rank from column 2; rows without a numeric value
     * in column 2 are ignored.
     * NOTE(review): createTSV() in this class writes the rank to column 3
     * (column 2 holds the incidence) - confirm which file this is meant to read.
     *
     * @param string $path     tab-separated input file
     * @param int    $max_rows maximum number of data rows to read; the default
     *                         of 1000 mirrors the limit this method has always
     *                         had, pass 0 to process the whole file
     * @return array name => standard deviation of its ranks
     * @throws RuntimeException when $path cannot be opened
     */
    public function calcNamesStats($path, $max_rows = 1000){
        $handle = fopen($path, 'r');
        if ($handle === false) {
            throw new RuntimeException("calcNamesStats: cannot open '$path': ".print_r(error_get_last(), true));
        }

        // Running statistics per name (Welford's method): one pass over the
        // file, no need to keep every rank or to read the file twice.
        // name => ["count" => N, "mean" => AR so far, "m2" => sum of squared deviations]
        $stats = [];
        $first = true;
        $rows_read = 0;

        while (($row = fgetcsv($handle, 0, "\t")) !== false) {
            if ($first){
                $first = false;
                continue;
            }
            if ($max_rows > 0 && $rows_read >= $max_rows){
                break;
            }
            $rows_read++;

            if (!isset($row[0], $row[2]) || !is_numeric($row[2])){
                continue;
            }
            $name = $row[0];
            $rank = (float)$row[2];

            if (!isset($stats[$name])){
                $stats[$name] = ["count" => 0, "mean" => 0.0, "m2" => 0.0];
            }
            $delta = $rank - $stats[$name]["mean"];
            $stats[$name]["count"]++;
            $stats[$name]["mean"] += $delta / $stats[$name]["count"];
            $stats[$name]["m2"] += $delta * ($rank - $stats[$name]["mean"]);
        }

        fclose($handle);

        $final_res = [];
        foreach ($stats as $name=>$element){
            $final_res[$name] = sqrt($element["m2"] / $element["count"]);
        }

        return $final_res;
    }
}


