Script 3: statistical_analysis_segmented_clustering.m
% This script is used to conduct statistical analysis (consisting of
% K-Medoids clustering, Fisher's exact test, and leave-p-out
% cross-validation) on each of the patent indicators and patent indicator
% sets extracted for each technology using the 'extract_patent_metrics.m'
% script, which have subsequently been compiled in the 'Extracted patent
% indicators.xlsx' spreadsheet.
% This script is used to create spectrograms showing frequency content
% against time slots for each of the patent indicators extracted using the
% 'extract_patent_metrics.m' script, which have subsequently been compiled
% in the 'Extracted patent indicators.xlsx' spreadsheet.
clearvars
clear all
close all
tic()
% Request the user to choose the current working directory (NB: if no
% folder is selected in the user interface then the current working
% directory is taken to be the filepath) OR do not ask the user and just
% take the current directory:
% filepath = uigetdir;
patent_data_path = pwd;
% Identify the name of the selected working directory:
[upperPath,deepestFolder] = fileparts(patent_data_path);
% Set the number of clusters to group into:
num_clusters = 2;
% Set the number of technologies to leave-out in leave-p-out
% cross-validation:
num_technologies_to_remove = 1;
% Set the total number of patent indicators available in the excel file:
max_num_indicators = 10;
% Specify column index containing Technology Life Cycle stage data points
% (relative to order in technology_data.data structure):
TLC_stages_column_idx = 12;
% Preallocate empty cell and double arrays for storing indicator subsets:
indicator_subset = cell(1, max_num_indicators);
% Generate the indicator subsets to consider for clustering purposes:
% 1 = patents by application year
% 2 = patents by priority year
% 3 = corporates by priority year
% 4 = non-corporates by priority year
% 5 = set of inventors by priority year
% 6 = cited references by priority year
% 7 = cited patents by priority year
% 8 = distinct IPC subclass by priority year
% 9 = top 5 IPC subclass patents by priority year
% 10 = top 10 IPC subclass patents by priority year
for i = 1:max_num_indicators
    indicator_subset{i} = nchoosek(1:max_num_indicators,i);
    if i == 1
        indicator_subset_list = num2cell(indicator_subset{i},2);
    else
        indicator_subset_list = [indicator_subset_list; num2cell(indicator_subset{i},2)];
    end   
end
% Calculate the total number of possible indicator subsets:
total_num_indicator_subsets = size(indicator_subset_list,1);
% Import compiled patent indicator data from the 'Extracted patent
% indicators.xlsx' spreadsheet:
technology_data = importdata('Extracted patent indicators.xlsx');
% Extract list of technologies included:
all_technologies = fieldnames(technology_data.data);
% Set the known cluster IDs for each of the technologies included in
% the patent indicator dataset, where 1 = technology arising as a result of
% previous stagnation and 2 = technology arising from presumption (NB:
% THESE VALUES ARE BASED ON PRIOR LITERATURE EVIDENCE AND NEED TO BE
% UDPATED/RESEQUENCED IF ANY NEW TECHNOLOGIES ARE ADDED TO/REMOVED FROM THE
% PATENT INDICATOR DATASET, OTHERWISE LATER PREDICTED VS. KNOWN CLUSTERING
% COMPARISONS WILL BE MISALIGNED OR WILL NOT HAVE THE CORRECT MATRIX
% DIMENSIONS):
technology_data.known_cluster_id.all_technologies = [1;1;2;1;2;2;1;2;1;2;1;1;2;1;1;1;2;2;2;1;1;1;2;2;2;1];
% Transform, smooth, and normalise the patent indicators considered:
for i = 1:numel(all_technologies)
    % Transform the patent indicator considered:
    for j = 1:max_num_indicators
        % Select the current indicator vector to transform, smooth, and
        % normalise:
        current_indicator_data = technology_data.data.(all_technologies{i})(1:end-1,1+j);
       
        % Transform the technology data points by calculating the
        % equivalent Inverse Hyperbolic Sine values (this in itself is
        % approximately equivalent to calculating the natural logarithm of
        % the values, except that this can also handle data values equal to
        % zero):
        technology_data.transformed_data.(all_technologies{i})(:,j) = asinh(current_indicator_data);
       
        % (UPDATE: The technology data points used in statistical
        % correlation analysis and significance testing should NOT be
        % smoothed, in line with the comments and illustration provided
        % here:
        % https://stats.stackexchange.com/questions/144013/smoothing-when-to-use-it-and-when-not-to
        % As such, the smoothing command below is no longer used and the
        % transformed data is used without any further smoothing. However,
        % smoothing the data for use in forecasting IS acceptable, and as
        % such the smoothing of time series conducted as part of the
        % subsequent functional data analysis process is not a problem)
       
        % ORIGINAL COMMANDS, NOW NO LONGER USED: Smooth the technology data
        % points by calculating three-year moving averages for each of the
        % patent indicators considered:
        technology_data.smoothed_data.(all_technologies{i})(:,j) = smooth(technology_data.transformed_data.(all_technologies{i}) (:,j),3);
%         technology_data.smoothed_data.(all_technologies{i})(:,j) = smooth(current_indicator_data,3);
    end
   
    % Trim the array to remove the last few years were records have not yet
    % all been accounted for (i.e. it normally takes several years to
    % register all the patent records from a specific year, so the very end
    % years are likely to be incomplete). In these datasets, data after
    % 2011 seems to tail-off:
    trim_index = find(technology_data.data.(all_technologies{i}) (:,1) > 2011);
    technology_data.trimmed_data.(all_technologies{i}) = technology_data.transformed_data.(all_technologies{i});
    technology_data.trimmed_data.(all_technologies{i})(trim_index,:) = [];
   
    % Normalise the patent indicator considered:
    for j = 1:max_num_indicators
        % Normalise the technology data points by dividing all values by
        % the maximum recorded value for each of the patent indicators
        % considered:
        technology_data.normalised_data.(all_technologies{i})(:,j) = technology_data.trimmed_data.(all_technologies{i}) (:,j) / max(technology_data.trimmed_data.(all_technologies{i})(:,j));
%         technology_data.normalised_data.(all_technologies{i}) (:,j) = technology_data.smoothed_data.(all_technologies{i}) (:,j) / max(technology_data.smoothed_data.(all_technologies{i}) (:,j));
    end
   
    % Add normalised time and Technology Life Cycle columns back on to
    % normalised datasets:
    technology_data.normalised_data.(all_technologies{i}) = [technology_data.normalised_data.(all_technologies{i}), (0:1/ (size(technology_data.normalised_data.(all_technologies{i}), 1) - 1):1)'];
    technology_data.normalised_data.(all_technologies{i}) = [technology_data.normalised_data.(all_technologies{i}), technology_data.data.(all_technologies{i}) (1:end-(1 + size(trim_index,1)), 12:19)];
end
% Determine the most advanced technology life cycle stage reached by each
% technology:
max_TLC_stage = zeros(numel(all_technologies),1);
for i = 1:numel(all_technologies)
    max_TLC_stage(i) = max(technology_data.normalised_data.(all_technologies{i}) (:,TLC_stages_column_idx));
end
% Determine the minimum technology life cycle stage covered by each
% technology dataset:
min_TLC_stage = zeros(numel(all_technologies),1);
for i = 1:numel(all_technologies)
    min_TLC_stage(i) = min(technology_data.normalised_data.(all_technologies{i}) (:,TLC_stages_column_idx));
end
% Filter out data from technologies with insufficient data records:
technology_data_filtered = technology_data;
technology_data_filtered.known_cluster_id.filtered = technology_data.known_cluster_id.all_technologies;
% fields_to_remove = {};
technologies_with_insufficient_records = {all_technologies{1}, all_technologies{5}, all_technologies{20}};
technology_data_filtered.data = rmfield(technology_data_filtered.data, technologies_with_insufficient_records);
technology_data_filtered.textdata = rmfield(technology_data_filtered.textdata, technologies_with_insufficient_records);
technology_data_filtered.colheaders = rmfield(technology_data_filtered.colheaders, technologies_with_insufficient_records);
technology_data_filtered.transformed_data = rmfield( technology_data_filtered.transformed_data, technologies_with_insufficient_records);
technology_data_filtered.smoothed_data = rmfield(technology_data_filtered.smoothed_data, technologies_with_insufficient_records);
technology_data_filtered.trimmed_data = rmfield(technology_data_filtered.trimmed_data, technologies_with_insufficient_records);
technology_data_filtered.normalised_data = rmfield( technology_data_filtered.normalised_data, technologies_with_insufficient_records);
technology_data_filtered.all_TLC_stages = technology_data_filtered.normalised_data;
elements_to_remove = ismember(all_technologies, technologies_with_insufficient_records); technology_data_filtered.known_cluster_id.filtered( elements_to_remove)
= [];
technology_data_filtered.known_cluster_id.all_TLC_stages =
technology_data_filtered.known_cluster_id.filtered;
% Filter datasets corresponding to technologies that are registered to
% have passed/be passing through the emergence stage of the Technology Life
% Cycle:
technologies_to_exclude_emergence = [all_technologies(max_TLC_stage < 1); all_technologies(min_TLC_stage > 1)];
technology_data_filtered.emergence = rmfield(technology_data.normalised_data, union(technologies_with_insufficient_records, technologies_to_exclude_emergence));
elements_to_remove_emergence = ismember(all_technologies, union(technologies_with_insufficient_records, technologies_to_exclude_emergence));
technology_data_filtered.known_cluster_id.emergence = technology_data.known_cluster_id.all_technologies;
technology_data_filtered.known_cluster_id.emergence( elements_to_remove_emergence) = [];
% Filter datasets corresponding to technologies that are registered to
% have passed/be passing through the growth stage of the Technology Life
% Cycle:
technologies_to_exclude_growth = [all_technologies(max_TLC_stage < 2); all_technologies(min_TLC_stage > 2)];
technology_data_filtered.growth = rmfield(technology_data.normalised_data, union(technologies_with_insufficient_records, technologies_to_exclude_growth));
elements_to_remove_growth = ismember(all_technologies, union(technologies_with_insufficient_records, technologies_to_exclude_growth));
technology_data_filtered.known_cluster_id.growth = technology_data.known_cluster_id.all_technologies;
technology_data_filtered.known_cluster_id.growth( elements_to_remove_growth) = [];
% Filter datasets corresponding to technologies that are registered to
% have passed/be passing through the maturity stage of the Technology Life
% Cycle:
technologies_to_exclude_maturity = [all_technologies(max_TLC_stage < 3); all_technologies(min_TLC_stage > 3)];
technology_data_filtered.maturity = rmfield(technology_data.normalised_data, union(technologies_with_insufficient_records, technologies_to_exclude_maturity));
elements_to_remove_maturity = ismember(all_technologies, union(technologies_with_insufficient_records, technologies_to_exclude_maturity));
technology_data_filtered.known_cluster_id.maturity = technology_data.known_cluster_id.all_technologies;
technology_data_filtered.known_cluster_id.maturity( elements_to_remove_maturity) = [];
% Filter datasets corresponding to technologies that are registered to
% have passed/be passing through the decline stage of the Technology Life
% Cycle:
technologies_to_exclude_decline = [all_technologies(max_TLC_stage < 4); all_technologies(min_TLC_stage > 4)];
technology_data_filtered.decline = rmfield(technology_data.normalised_data, union(technologies_with_insufficient_records, technologies_to_exclude_decline));
elements_to_remove_decline = ismember(all_technologies, union(technologies_with_insufficient_records, technologies_to_exclude_decline));
technology_data_filtered.known_cluster_id.decline = technology_data.known_cluster_id.all_technologies;
technology_data_filtered.known_cluster_id.decline( elements_to_remove_decline) = [];
% Extract list of technologies included:
technology.all_TLC_stages = fieldnames(technology_data_filtered.data);
technology.emergence = fieldnames(technology_data_filtered.emergence);
technology.growth = fieldnames(technology_data_filtered.growth);
technology.maturity = fieldnames(technology_data_filtered.maturity);
technology.decline = fieldnames(technology_data_filtered.decline);
% Compile list of technology life cycle stages considered:
TLC_stages = fieldnames(technology);
% Limit data points within these datasets to only those points classed as
% being during the 'emergence' technology life cycle stage:
for i = 1:numel(technology.emergence)
    technology_data_filtered.emergence.(technology.emergence{i}) = technology_data_filtered.emergence.(technology.emergence{i}) ((technology_data_filtered.emergence.(technology.emergence{i}) (:,TLC_stages_column_idx) == 1),:);
end
% Limit data points within these datasets to only those points classed as
% being during the 'growth' technology life cycle stage:
for i = 1:numel(technology.growth)
    technology_data_filtered.growth.(technology.growth{i}) = technology_data_filtered.growth.(technology.growth{i}) ((technology_data_filtered.growth.(technology.growth{i}) (:,TLC_stages_column_idx) == 2),:);
end
% Limit data points within these datasets to only those points classed as
% being during the 'maturity' technology life cycle stage:
for i = 1:numel(technology.maturity)
    technology_data_filtered.maturity.(technology.maturity{i}) = technology_data_filtered.maturity.(technology.maturity{i}) ((technology_data_filtered.maturity.(technology.maturity{i}) (:,TLC_stages_column_idx) == 3),:);
end
% Limit data points within these datasets to only those points classed as
% being during the 'decline' technology life cycle stage:
for i = 1:numel(technology.decline)
    technology_data_filtered.decline.(technology.decline{i}) = technology_data_filtered.decline.(technology.decline{i}) ((technology_data_filtered.decline.(technology.decline{i}) (:,TLC_stages_column_idx) == 4),:);
end
% Store patent indicator column names:
patent_indicator_column_names = {'patents by application year'; 'patents by priority year';...
    'corporates by priority year'; 'non corporates by priority year';...
    'set of inventors by priority year'; 'cited references by priority year';...
    'cited patents by priority year'; 'distinct IPC subclass by priority year';...
    'top 5 IPC subclass patents by priority year'; 'top 10 IPC subclass patents by priority year'};
% Set figures generated to be invisible by default:
set(0,'DefaultFigureVisible','off')
% Determine screensize so that figures are scaled to the right size for the
% current monitor (scrsz == screen size vector [left, bottom, width,
% height])
scrsz = get(0,'ScreenSize');
%% Move to the relevant 'Statistics results' folder
% (UF = unfiltered, F = filtered, IHS = Inverted Hyperbolic Sine function
% applied, S = smoothed and normalised, sub = using subsets of patent
% indicators, LOOCV = 'leave-one-out' cross-validation, LHOCV =
% 'leave-half-the-technologies-out' cross-validation):
% cd('Statistics (Final - L1OCV)')
cd('Statistics (Final - L1OCV -FOSC)')
% cd('Statistics (Final - L2OCV)')
% cd('Statistics (Final - L3OCV)')
% cd('Statistics (Final - L4OCV)')
% cd('Statistics (Final - L5OCV)')
% cd('Statistics (Final - L6OCV)')
% cd('Statistics (Test)')
%% Preallocate empty double and cell arrays for storing distance data:
for stage = 1:numel(TLC_stages)
    DTW_distance_subset_indicator.(TLC_stages{stage}) = zeros(numel(technology.(TLC_stages{stage})), numel(technology.(TLC_stages{stage})), total_num_indicator_subsets);
    current_technology_warping_path.(TLC_stages{stage}) = cell(numel(technology.(TLC_stages{stage})), numel(technology.(TLC_stages{stage})), total_num_indicator_subsets);
    comparison_technology_warping_path.(TLC_stages{stage}) = cell(numel(technology.(TLC_stages{stage})), numel(technology.(TLC_stages{stage})), total_num_indicator_subsets);
end
%% Determine relative distances between each respective pair of technology patent indicator curves generated when comparing each technology to every other technology for each of the patent indicator subsets considered, for each Technology Life Cycle stage:
% Evaluate each TLC stage separately:
for stage = 1:numel(TLC_stages)
    % This is done using the Dynamic Time Warping (DTW) approach for each
    % of the technologies included in the patent indicators dataset:
    for i = 1:numel(technology.(TLC_stages{stage}))
        % Select the current indicator set for analysis:
        current_technology_data = technology_data_filtered.( TLC_stages{stage}).(technology.(TLC_stages{stage}){i});
        % Iterate through the comparison technology indicator sets:
        for j = 1:numel(technology.(TLC_stages{stage}))
            % Select the technology indicator set to measure distance
            % against:
            comparison_technology_data = technology_data_filtered.( TLC_stages{stage}).(technology.(TLC_stages{stage}){j});
            % Iterate through each indicator subset to base clustering on:
            for k = 1:total_num_indicator_subsets
                % Select the current patent indicator subset to measure
                % distances against:
                current_indicator_subset = indicator_subset_list{k,1};
                % Generate a new figure for comparing original and aligned
                % signals:
%                 figure_name = ['DTW of ',technology.(TLC_stages{stage}){j}, ' relative to ',technology.(TLC_stages{stage}){i}, ' - ', (TLC_stages{stage}), ' (INDICATOR SUBSET ',num2str(k),')'];
%                 figure('Name',figure_name, 'NumberTitle', 'off', 'OuterPosition', [1, 1, scrsz(3), scrsz(4)]);
                % Calculate the distance between technologies using only
                % the selected subset of patent indicators:
                [DTW_distance_subset_indicator.(TLC_stages{stage})(i,j,k), current_technology_warping_path.(TLC_stages{stage}) {i,j,k}(1,:), comparison_technology_warping_path.(TLC_stages{stage}) {i,j,k}(1,:)] = dtw(current_technology_data(:,current_indicator_subset)', comparison_technology_data(:,current_indicator_subset)');
%                 dtw(current_technology_data(:, current_indicator_subset)', comparison_technology_data(:, current_indicator_subset)');
                % Set subplots to be invisible now, but visible when opened
                % later:
%                 set(gcf,'Visible','off','CreateFcn','set(gcf,''Visible'',''on'')')       
%                 saveas(gcf,figure_name,'fig')           
            end
        end
    %     toc()
    end
    % toc()
   
end
% toc()
%% Save all variables to a MAT file:
save('statistical_analysis.mat')
toc()
%% Set figures generated to be visible again by default:
% set(0,'DefaultFigureVisible','on')
%% Cluster the technology time series either based on individual patent indicator distances, or by considering distances measured when considering all (or a subset) of patent indicators simultaneously.
%% Determine technology clusters based on considering a specified subset of patent indicators simultaneously:
% Preallocate empty double and cell arrays for storing cluster ID results
% for each subset:
for stage = 1:numel(TLC_stages)
    cluster_ID_subset.(TLC_stages{stage}) = zeros(numel(technology.(TLC_stages{stage})), total_num_indicator_subsets);
    medoid_locations_idx_subset.(TLC_stages{stage}) = zeros(num_clusters, total_num_indicator_subsets);
    cluster_percentage_difference.(TLC_stages{stage}) = zeros(1, total_num_indicator_subsets);
    realigned_cluster_ID_subsets.(TLC_stages{stage}) = zeros(numel(technology.(TLC_stages{stage})), total_num_indicator_subsets);
    significance.(TLC_stages{stage}) = zeros(1, total_num_indicator_subsets);
    p_value.(TLC_stages{stage}) = zeros(1, total_num_indicator_subsets);
    group_maps.(TLC_stages{stage}) = cell(1, total_num_indicator_subsets);
    confusion_matrix.(TLC_stages{stage}) = cell(1, total_num_indicator_subsets);
    p_value_stats.(TLC_stages{stage}) = cell(1, total_num_indicator_subsets);
end
% Evaluate each TLC stage separately:
for stage = 1:numel(TLC_stages)
    for k = 1:total_num_indicator_subsets
        % Determine K-Medoid clusters for the current subset of patent
        % indicators:
        [cluster_ID_subset.(TLC_stages{stage})(:,k),medoid_locations_subset, within_cluster_sum_subset, distance_to_medoid_subset, medoid_locations_idx_subset.(TLC_stages{stage})(:,k), info_subset] = kmedoids(DTW_distance_subset_indicator.(TLC_stages{stage})(:,:,k), num_clusters, 'Algorithm', 'pam');
        % Realign group mappings in the predicted cluster ID results to
        % match those of the known cluster groups:
        group_mappings_A = cluster_ID_subset.(TLC_stages{stage})(:,k);
        group_mappings_B = technology_data_filtered.known_cluster_id.(TLC_stages{stage});
        [group_map, realigned_predicted_cluster_IDs] = group_mappings(group_mappings_A, group_mappings_B);
        group_maps.(TLC_stages{stage}){k} = group_map;
        realigned_cluster_ID_subsets.(TLC_stages{stage})(:,k) = realigned_predicted_cluster_IDs';
        % Calculate the confusion matrix for the current specified subset
        % of indicators:
        confusion_matrix.(TLC_stages{stage}){k} = confusionmat(technology_data_filtered.known_cluster_id.( TLC_stages{stage}), realigned_predicted_cluster_IDs);
        % Use Fisher's exact test to determine the statistical significance
        % (i.e. the two-tail p-value) of the current specified subset of
        % indicators. This tests the null hypothesis that the current
        % observed group labels are not related to the predicted (i.e.
        % literature-based) group labels. As such a p-value of less than
        % 0.05 implies that the null hypothesis is rejected, and that there
        % is a statistical significance between the current specified
        % patent indicator subset and the expected groupings:
        if num_clusters == 2
            [significance.(TLC_stages{stage})(k), p_value.(TLC_stages{stage})(k), p_value_stats.(TLC_stages{stage}){k}] = fishertest(confusion_matrix.(TLC_stages{stage}){k});
        elseif num_clusters == 3
            p_value.(TLC_stages{stage})(k) = myfisher33(confusion_matrix.(TLC_stages{stage}){k});
            if p_value.(TLC_stages{stage})(k) <= 0.05
                significance.(TLC_stages{stage})(k) = 1;
            else
                significance.(TLC_stages{stage})(k) = 0;
            end
        end
        % Create target and output matrices for use in confusion plot for
        % the current specified subset of indicators:
        for j = 1:num_clusters
            if j == 1
                targets = (technology_data_filtered.known_cluster_id.(TLC_stages{stage}) == 1)';
                outputs = realigned_predicted_cluster_IDs == 1;
            else
                targets = [targets; (technology_data_filtered.known_cluster_id.(TLC_stages{stage}) == j)'];
                outputs = [outputs; realigned_predicted_cluster_IDs == j];
            end
        end
        % Convert target and output matrices to double arrays:
        targets = double(targets);
        outputs = double(outputs);
        % Plot the classification confusion matrix for the current
        % specified subset of indicators:
%         figure_name = ['Classification confusion matrix based on patent indicators subset ',num2str(k),' - ',TLC_stages{stage}];
%         figure('Name', figure_name, 'NumberTitle', 'off', 'OuterPosition', [1, 1, scrsz(3), scrsz(4)]);
%         hold on;
%         plotconfusion(targets,outputs);
%
%         % Add title to figure:
%         title(figure_name);
%
%         % Set subplots to be invisible now, but visible when opened later:
%         set(gcf,'Visible','off','CreateFcn','set(gcf,''Visible'',''on'')')       
%
%         % Save figure:
%         saveas(gcf,figure_name,'fig');
%         hold off;
        % Compare the predicted cluster IDs to the known cluster IDs
        % (taken from literature evidence) using hamming distance between
        % vectors:
        cluster_percentage_difference.(TLC_stages{stage})(k) = pdist2(technology_data_filtered.known_cluster_id.( TLC_stages{stage})', realigned_predicted_cluster_IDs, 'hamming');
        % Plot the time series clusters based on considering a specified
        % subset of patent indicators simultaneously:
%         figure_name = ['K-medoids clustering of technologies based on patent indicators subset ',num2str(k),' - ',TLC_stages{stage},' (',patent_indicator_column_names{1},')'];
%         figure('Name', figure_name, 'NumberTitle', 'off', 'OuterPosition', [1, 1, scrsz(3), scrsz(4)]);
%         hold on;
%
%         for j = 1:numel(technology.(TLC_stages{stage}))
%             if realigned_predicted_cluster_IDs(j) == 1
%                 plot(technology_data_filtered.( TLC_stages{stage}).(technology.( TLC_stages{stage}){j})(:,11), technology_data_filtered.( TLC_stages{stage}).(technology.( TLC_stages{stage}){j})(:,1),'r-');
%             elseif realigned_predicted_cluster_IDs(j) == 2
%                 plot(technology_data_filtered.( TLC_stages{stage}).(technology.( TLC_stages{stage}){j})(:,11), technology_data_filtered.( TLC_stages{stage}).(technology.( TLC_stages{stage}){j})(:,1),'b-');
%             elseif realigned_predicted_cluster_IDs(j) == 3
%                 plot(technology_data_filtered.( TLC_stages{stage}).(technology.( TLC_stages{stage}){j})(:,11), technology_data_filtered.( TLC_stages{stage}).(technology.( TLC_stages{stage}){j})(:,1),'g-');           
%             end
%         end
%
%         % Plot the medoid time series when considering a specified subset
%         % of patent indicators simultaneously:
%         for j = 1:num_clusters
%             plot(technology_data_filtered.( TLC_stages{stage}).(technology.( TLC_stages{stage}){medoid_locations_idx_subset.( TLC_stages{stage})(j,k)})(:,11), technology_data_filtered.( TLC_stages{stage}).(technology.( TLC_stages{stage}){medoid_locations_idx_subset.( TLC_stages{stage})(j,k)})(:,1), 'co','MarkerSize',7,'LineWidth',1.5);
%         end
%
%         % Add legend to the technology time series clusters:
%         legend(strrep(technology.(TLC_stages{stage}),'_',' '), 'FontSize',12,'Location','northwest');
%
%         % Add title to figure:
%         title(figure_name);
%
%         % Set subplots to be invisible now, but visible when opened later:
%         set(gcf,'Visible','off','CreateFcn','set(gcf,''Visible'',''on'')')       
%
%         % Save figure:
%         saveas(gcf,figure_name,'fig');
%         hold off;
    end
%     toc()
end
toc()
%% Set figures generated to be visible again by default:
set(0,'DefaultFigureVisible','on')
%% Plot histograms based on the minimum difference to the known cluster IDs, as well as those indicator subsets that are statistically significant:
% Evaluate each TLC stage separately:
for stage = 1:numel(TLC_stages)
    % Locate indicator subsets giving the minimum difference to the known
    % cluster IDs, as well as those indicator subsets that are
    % statistically significant:
    [min_cluster_difference_value, min_cluster_difference_index] = min(cluster_percentage_difference.(TLC_stages{stage}));
    min_difference_subsets.(TLC_stages{stage}) = indicator_subset_list(find( cluster_percentage_difference.(TLC_stages{stage}) == min_cluster_difference_value));
    significance_logical = logical(significance.(TLC_stages{stage}));
    statistically_significant_subsets.(TLC_stages{stage}) = indicator_subset_list(significance_logical);
    % Compare lists of minimum difference and statistically significant
    % subsets:
    common_subsets_stat_significance.(TLC_stages{stage}) = ismember(cellfun(@num2str, statistically_significant_subsets.(TLC_stages{stage}), 'UniformOutput',0), cellfun(@num2str, min_difference_subsets.(TLC_stages{stage}), 'UniformOutput',0));
    common_subsets_min_difference.(TLC_stages{stage}) = ismember(cellfun(@num2str, min_difference_subsets.(TLC_stages{stage}), 'UniformOutput',0), cellfun(@num2str, statistically_significant_subsets.(TLC_stages{stage}), 'UniformOutput',0));
    % Plot a histogram of the most frequently occuring indicators in
    % identified best patent indicator subsets:
    figure_name = ['Histogram of most frequently occuring patent indicators in minimum hamming difference cluster predictions - ',TLC_stages{stage}];
    figure('Name', figure_name, 'NumberTitle', 'off', 'OuterPosition', [1, 1, scrsz(3), scrsz(4)]);
    hold on;
    histogram([min_difference_subsets.(TLC_stages{stage}) {1:end}]);
    % Add title, X, and Y labels to figure:
    title(figure_name);
    xlabel('Patent indicator','FontSize',12)
    ylabel('Count','FontSize',12)
    % Save figure:
    saveas(gcf,figure_name,'fig');
    hold off;
    % Plot a histogram of the most frequently occuring indicators in
    % identified statistically significant patent indicator subsets:
    figure_name = ['Histogram of most frequently occuring patent indicators in statistically significant cluster predictions - ',TLC_stages{stage}];
    figure('Name', figure_name, 'NumberTitle', 'off', 'OuterPosition', [1, 1, scrsz(3), scrsz(4)]);
    hold on;
    histogram([statistically_significant_subsets.(TLC_stages{stage}) {1:end}]);
    % Add title, X, and Y labels to figure:
    title(figure_name);
    xlabel('Patent indicator','FontSize',12)
    ylabel('Count','FontSize',12)
    % Save figure:
    saveas(gcf,figure_name,'fig');
    hold off;
end
%% Save all variables to a MAT file:
save('statistical_analysis.mat');
toc()
%% Run 'leave-p-out' cross-validation:
% The sections of the script that follow look at conducting leave-p-out
% cross-validation of each of the patent indicator subsets that have been
% identified as providing a statistically significant correlation to the
% expected technology groupings. The analysis in the sections below works
% on leave-half-the-technologies-out cross-validation (to give the largest
% sample for cross-validation). Alternatively, leave-one-out
% cross-validation can be used or any other leave-p-out variant by
% specifiying an alternative number of technologies to omit from the
% training subset permutations (see line where
% 'training_technology_subsets' are defined). This follows on from
% discussions with the Bristol University Statistics Clinic on the
% 22_03_17.
%% Determine relative distances between each respective pair of technology patent indicator curves generated when comparing each technology to every other technology for each of the statistically significant patent indicator subsets for the current technology training dataset:
% Evaluate each TLC stage separately:
for stage = 1:numel(TLC_stages)
    % Create training technology subsets based on
    % leave-half-the-technologies-out cross-validation:
%     training_technology_subsets.( TLC_stages{stage}) = nchoosek(1:numel(technology.( TLC_stages{stage})), numel(technology.( TLC_stages{stage})) - floor(numel(technology.( TLC_stages{stage}))/2));
    % Alternatively, create training technology subsets based on
    % leave-p-out cross-validation:
    training_technology_subsets.( TLC_stages{stage}) = nchoosek(1:numel(technology.( TLC_stages{stage})), numel(technology.( TLC_stages{stage})) - num_technologies_to_remove);
    % Determine test technology subsets based on the corresponding training
    % technology subsets:
    test_technology_subsets.( TLC_stages{stage}) = zeros(size(training_technology_subsets.( TLC_stages{stage}),1), numel(technology.( TLC_stages{stage})) - size(training_technology_subsets.( TLC_stages{stage}),2));
    for i = 1:size(training_technology_subsets.(TLC_stages{stage}),1)
        test_technology_subsets.( TLC_stages{stage})(i,:) = setdiff(1:numel(technology.( TLC_stages{stage})), training_technology_subsets.( TLC_stages{stage})(i,:));
    end
    % Extract the total number of statistically significant patent
    % indicator subset combinations:
    total_num_statistically_significant_indicator_subsets.( TLC_stages{stage}) = numel(statistically_significant_subsets.( TLC_stages{stage}));
    % Create matrices corresponding to training and test technology subset
    % indices respectively:
    training_technology_subsets_idx.( TLC_stages{stage}) = zeros(size(training_technology_subsets.( TLC_stages{stage}),2), total_num_statistically_significant_indicator_subsets.( TLC_stages{stage}), size(training_technology_subsets.( TLC_stages{stage}),1));
    test_technology_subsets_idx.( TLC_stages{stage}) = zeros(size(test_technology_subsets.( TLC_stages{stage}),2), total_num_statistically_significant_indicator_subsets.( TLC_stages{stage}), size(test_technology_subsets.( TLC_stages{stage}),1));
    for i = 1:size(training_technology_subsets.( TLC_stages{stage}),1)
        training_technology_subsets_idx.( TLC_stages{stage})(:,:,i) = repmat(training_technology_subsets.( TLC_stages{stage})(i,:)', 1,total_num_statistically_significant_indicator_subsets.( TLC_stages{stage}));
        test_technology_subsets_idx.( TLC_stages{stage})(:,:,i) = repmat(test_technology_subsets.( TLC_stages{stage})(i,:)', 1,total_num_statistically_significant_indicator_subsets.( TLC_stages{stage}));
    end
    % Preallocate empty double arrays for storing distance data:
    DTW_distance_training_technology_subset_indicator.( TLC_stages{stage}) = zeros(floor(numel(technology.( TLC_stages{stage}))/2), floor(numel(technology.( TLC_stages{stage}))/2), total_num_statistically_significant_indicator_subsets.( TLC_stages{stage}), size(training_technology_subsets.( TLC_stages{stage}),1));
    % Determine relative distances between each respective pair of
    % technology patent indicator curves generated when comparing each
    % technology to every other technology for each of the statistically
    % significant patent indicator subsets for the current technology
    % training dataset. This is done using the Dynamic Time Warping (DTW)
    % approach for each of the technologies included in the patent
    % indicators dataset:
    for training_technology_subset = 1:size(training_technology_subsets.( TLC_stages{stage}),1)
        % Select the current training technology subset for analysis:
        current_training_technology_subset = training_technology_subsets.( TLC_stages{stage}) (training_technology_subset,:);
        % Iterate through the current training technology indicator sets:
        for i = 1:numel(current_training_technology_subset)       
            % Select the current indicator set for analysis:
            current_technology_data = technology_data_filtered.( TLC_stages{stage}).(technology.( TLC_stages{stage}) {current_training_technology_subset(i)});
            % Iterate through the comparison technology indicator sets:
            for j = 1:numel(current_training_technology_subset)
                % Select the technology indicator set to measure distance
                % against:
                comparison_technology_data = technology_data_filtered.( TLC_stages{stage}).(technology.( TLC_stages{stage}) {current_training_technology_subset(j)});
                % Iterate through each statistically significantly
                % indicator subset to base clustering on:
                for k = 1:total_num_statistically_significant_indicator_subsets.( TLC_stages{stage})
                    % Select the current statistically significant
                    % indicator subset to measure distances against:
                    current_indicator_subset = statistically_significant_subsets.( TLC_stages{stage}){k,1};
                    % Calculate the distance between training technologies
                    % using only the selected statistically significant
                    % subset of indicators (as symettric matrix, only need
                    % to calculate the upper diagonal):
                    if i <= j
                        DTW_distance_training_technology_subset_indicator.( TLC_stages{stage})(i,j,k, training_technology_subset) = dtw(current_technology_data(:, current_indicator_subset)', comparison_technology_data(:, current_indicator_subset)');
                    else
                        DTW_distance_training_technology_subset_indicator.( TLC_stages{stage})(i,j,k, training_technology_subset) = DTW_distance_training_technology_subset_indicator.( TLC_stages{stage})(j,i,k, training_technology_subset);
                    end
                end
            end
        end
    %     toc()
    end
%     toc()
   
end
toc()
%% Save all variables to a MAT file:
save('statistical_analysis.mat');
%% Determine technology clusters based on the current training technology subset, whilst only considering statistically significant subsets of patent indicators simultaneously:
% Evaluate each TLC stage separately:
for stage = 1:numel(TLC_stages)
    % Preallocate empty double and cell arrays for storing cluster ID
    % results and distance data for each cross-validation subset:
    cluster_ID_training_technologies.( TLC_stages{stage}) = zeros(size(training_technology_subsets.( TLC_stages{stage}),2), total_num_statistically_significant_indicator_subsets.( TLC_stages{stage}), size(test_technology_subsets.( TLC_stages{stage}),1));
    medoid_locations_idx_cross_validation.( TLC_stages{stage}) = zeros(num_clusters, total_num_statistically_significant_indicator_subsets.(TLC_stages{stage}), size(test_technology_subsets.( TLC_stages{stage}),1));
    DTW_distance_test_technology_subset_indicator.( TLC_stages{stage}) = zeros(num_clusters, size(test_technology_subsets.( TLC_stages{stage}),2), total_num_statistically_significant_indicator_subsets.( TLC_stages{stage}), size(test_technology_subsets.( TLC_stages{stage}),1));
    cluster_ID_test_technologies.( TLC_stages{stage}) = zeros(size(test_technology_subsets.( TLC_stages{stage}),2), total_num_statistically_significant_indicator_subsets.( TLC_stages{stage}), size(test_technology_subsets.( TLC_stages{stage}),1));
    cluster_ID_cross_validation.(TLC_stages{stage}) = zeros(numel(technology.( TLC_stages{stage})), total_num_statistically_significant_indicator_subsets.( TLC_stages{stage}), size(test_technology_subsets.( TLC_stages{stage}),1));
    realigned_cluster_ID_cross_validation.( TLC_stages{stage}) = zeros(numel(technology.( TLC_stages{stage})), total_num_statistically_significant_indicator_subsets.( TLC_stages{stage}), size(test_technology_subsets.( TLC_stages{stage}),1));
    num_misclassified_test_technologies.( TLC_stages{stage}) = zeros(total_num_statistically_significant_indicator_subsets.( TLC_stages{stage}), size(test_technology_subsets.( TLC_stages{stage}),1));
    group_maps_cross_validation.( TLC_stages{stage}) = cell(total_num_statistically_significant_indicator_subsets.( TLC_stages{stage}), size(test_technology_subsets.( TLC_stages{stage}),1));
    % Iterate through technology test subsets:
    for test_technology_subset = 1:size(test_technology_subsets.( TLC_stages{stage}),1)
        % Select the current test technology subset for analysis:
        current_test_technology_subset = test_technology_subsets.( TLC_stages{stage})(test_technology_subset,:);
        % Iterate through subsets of statistically significant patent
        % indicators:
        for k = 1:total_num_statistically_significant_indicator_subsets.( TLC_stages{stage})
            % Determine K-Medoid clusters for the current statistically
            % significant patent indicators subset based on current
            % training technology subset:
            [cluster_ID_training_technologies.( TLC_stages{stage})(:,k,test_technology_subset), medoid_locations_cross_validation, within_cluster_sum_cross_validation, distance_to_medoid_cross_validation, medoid_locations_idx_cross_validation.( TLC_stages{stage})(:,k,test_technology_subset), info_cross_validation] = kmedoids(DTW_distance_training_technology_subset_indicator.( TLC_stages{stage})(:,:,k, test_technology_subset), num_clusters, 'Algorithm', 'pam');
            % Select the current statistically significant indicator subset
            % to measure distances against:
            current_indicator_subset = statistically_significant_subsets.( TLC_stages{stage}){k,1};
            % Iterate through the current test technology indicator sets:
            for i = 1:numel(current_test_technology_subset)       
                % Select the current indicator set for analysis:
                current_technology_data = technology_data_filtered.( TLC_stages{stage}).(technology.( TLC_stages{stage}) {current_test_technology_subset(i)});
                % Iterate through the comparison medoid technologies:
                for j = 1:num_clusters
                    % Select the technology indicator set to measure
                    % distance against:
                    comparison_medoid_technology_data = technology_data_filtered.( TLC_stages{stage}).(technology.( TLC_stages{stage}) {medoid_locations_idx_cross_validation.( TLC_stages{stage}) (j,k,test_technology_subset)});
                    % Use Dynamic Time Warping to determine the distance
                    % between each technology in the current test
                    % technology subset and the medoids located using the
                    % training technology subsets:
                    DTW_distance_test_technology_subset_indicator.( TLC_stages{stage})(j,i,k, test_technology_subset) = dtw(current_technology_data(:, current_indicator_subset)', comparison_medoid_technology_data(:, current_indicator_subset)');
                end
                % Use the minimum distance to determine the cluster label
                % to apply to the current test technology (aligned to group
                % labels assigned based on training technology subsets -
                % NB: the current group labels will subsequently need to be
                % realigned to match the known cluster IDs, but this is
                % done after all the test technologies have been mapped to
                % the existing groupings to avoid any unnecessary group
                % label translation errors at this stage):
                [~,nearest_medoid_idx] = min(DTW_distance_test_technology_subset_indicator.( TLC_stages{stage})(:,i,k, test_technology_subset));
                cluster_ID_test_technologies.( TLC_stages{stage})(i,k, test_technology_subset) = cluster_ID_training_technologies.( TLC_stages{stage})(medoid_locations_idx_cross_validation.( TLC_stages{stage})(nearest_medoid_idx, k, test_technology_subset), k, test_technology_subset);           
            end
            % Recombine training and test technology cluster IDs in their
            % correct order:
            cluster_ID_cross_validation.( TLC_stages{stage})(training_technology_subsets_idx.( TLC_stages{stage})(:, k, test_technology_subset), k, test_technology_subset) = cluster_ID_training_technologies.( TLC_stages{stage})(:, k, test_technology_subset);
            cluster_ID_cross_validation.( TLC_stages{stage})(test_technology_subsets_idx.( TLC_stages{stage})(:, k, test_technology_subset), k, test_technology_subset) = cluster_ID_test_technologies.( TLC_stages{stage})(:, k, test_technology_subset);
            % Realign group mappings in the predicted cluster ID results to
            % match those of the known cluster groups:
            [group_map, realigned_predicted_cluster_IDs] = group_mappings(cluster_ID_cross_validation.( TLC_stages{stage})(:, k, test_technology_subset), technology_data_filtered.known_cluster_id.( TLC_stages{stage}));
            group_maps_cross_validation.( TLC_stages{stage}){k, test_technology_subset} = group_map;
            realigned_cluster_ID_cross_validation.( TLC_stages{stage})(:,k, test_technology_subset) = realigned_predicted_cluster_IDs';
            % Calculate the number of test technologies that have been
            % misclustered based on the current training technology subset,
            % using the current statistically significant subset of patent
            % indicators:
            predicted_test_cluster_IDs = realigned_cluster_ID_cross_validation.( TLC_stages{stage})(test_technology_subsets_idx.( TLC_stages{stage})(:,k, test_technology_subset), k, test_technology_subset);
            corresponding_known_cluster_IDs = technology_data_filtered.known_cluster_id.( TLC_stages{stage})(test_technology_subsets_idx.( TLC_stages{stage})(:, k, test_technology_subset));
            misclassified_test_technologies = predicted_test_cluster_IDs ~= corresponding_known_cluster_IDs;
            num_misclassified_test_technologies.( TLC_stages{stage})(k, test_technology_subset) = sum(misclassified_test_technologies);
        end
    %     toc()
    end
%     toc()
   
end
toc()
%% Rank the statistically significant patent indicator subsets based on their respective ability to predict the correct technology groupings using different training technologies:
% Evaluate each TLC stage separately:
for stage = 1:numel(TLC_stages)
    % Calculate the average number of test technologies misclassified for
    % each statistically significant patent indicator subset for different
    % training technology subsets:
    statistically_significant_subsets_avg_misclassified.( TLC_stages{stage}) = sum(num_misclassified_test_technologies.( TLC_stages{stage}),2) / size(num_misclassified_test_technologies.( TLC_stages{stage}),2);
    % Rank each statistically significant patent indicator subset based on
    % the average number of test technologies misclassified for different
    % training technology subsets:
    [statistically_significant_subsets_sorted.( TLC_stages{stage}), statistically_significant_subsets_rank.( TLC_stages{stage})] = sort(statistically_significant_subsets_avg_misclassified.( TLC_stages{stage}));
    % Locate the statistically significant patent indicator subsets giving
    % the minimum average number of test technologies misclassified for
    % different training technology subsets:
    num_min_avg_misclassified.( TLC_stages{stage}) = sum(statistically_significant_subsets_sorted.( TLC_stages{stage}) == min(statistically_significant_subsets_sorted.( TLC_stages{stage})));
    statistically_significant_subsets_min_avg_misclassified.( TLC_stages{stage}) = statistically_significant_subsets.( TLC_stages{stage})(statistically_significant_subsets_rank.( TLC_stages{stage})(1:num_min_avg_misclassified.( TLC_stages{stage})));
    % Locate the top 10% of statistically significant patent indicator
    % subsets:
    if total_num_statistically_significant_indicator_subsets.( TLC_stages{stage}) > 0
        statistically_significant_subsets_top_ten_percent.( TLC_stages{stage}) = statistically_significant_subsets.( TLC_stages{stage})(statistically_significant_subsets_rank.( TLC_stages{stage})(1:ceil(0.1 * total_num_statistically_significant_indicator_subsets.( TLC_stages{stage}))));
    end
end
%% Plot histograms based on best performing cross-validated patent indicator subsets:
% Evaluate each TLC stage separately:
for stage = 1:numel(TLC_stages)
    % Plot a histogram of the most frequently occuring indicators in
    % identified best patent indicator subsets:
    figure_name = ['Histogram of most frequently occuring patent indicators in best performing cross-validated patent indicator subsets - ', TLC_stages{stage}];
    figure('Name', figure_name, 'NumberTitle', 'off', 'OuterPosition', [1, 1, scrsz(3), scrsz(4)]);
    hold on;
    histogram([statistically_significant_subsets_min_avg_misclassified.( TLC_stages{stage}){1:end}]);
    % Add title, X, and Y labels to figure:
    title(figure_name);
    xlabel('Patent indicator','FontSize',12)
    ylabel('Count','FontSize',12)
    % Save figure:
    saveas(gcf,figure_name,'fig');
    hold off;
   
    if total_num_statistically_significant_indicator_subsets.( TLC_stages{stage}) > 0
        % Plot a histogram of the most frequently occuring indicators in
        % identified best patent indicator subsets:
        figure_name = ['Histogram of most frequently occurring patent indicators in top ten percent of statistically significant cross-validated subsets - ', TLC_stages{stage}];
        figure('Name', figure_name, 'NumberTitle', 'off', 'OuterPosition', [1, 1, scrsz(3), scrsz(4)]);
        hold on;
        histogram([statistically_significant_subsets_top_ten_percent.( TLC_stages{stage}){1:end}]);
        % Add title, X, and Y labels to figure:
        title(figure_name);
        xlabel('Patent indicator','FontSize',12)
        ylabel('Count','FontSize',12)
        % Save figure:
        saveas(gcf,figure_name,'fig');
        hold off;
    end
end
%% Save all variables to a MAT file:
save('statistical_analysis.mat');
%% Return to the patent data folder:
cd(patent_data_path)
toc()