Script 2: extract_patent_metrics.m
% This script is used to clean patent data files (already imported from
% HTML files using the 'batch_html_conversion.m' script and saved as .MAT
% files) and extract patent metrics required for use in subsequent 'nearest
% neighbour' pattern recognition analysis.
clearvars
clear all
close all
tic()
% Request the user to choose the current working directory (NB: if no
% folder is selected in the user interface then the current working
% directory is taken to be the filepath) OR do not ask the user and just
% take the current directory:
% filepath = uigetdir;
filepath = pwd;
% Identify the name of the selected working directory:
[upperPath,deepestFolder] = fileparts(filepath);
% Identify .MAT files in the selected working directory that match the name
% of the folder:
files = dir(strcat(deepestFolder,'*.mat'));
% Load list of International Patent Classification (IPC) subclasses:
load('IPC_subclasses.mat','IPC_subclasses')
% Initialise upper and lower global time limits:
upper_time_limit = inf;
lower_time_limit = -inf;
% Initialise the 'completed files' tracker:
completed_files = zeros(size(files,1),1);
% Preallocate empty cell arrays for building cumulative lists of unique
% corporate, non-corporate, and set of inventor entries:
unique_corporations_cumulative = cell(1,size(files,1));
unique_non_corporates_cumulative = cell(1,size(files,1));
unique_set_of_inventors_cumulative = cell(1,size(files,1));
% Preallocate empty cell arrays for compiling file summary counts:
patents_by_application_year = cell(1,size(files,1));
patents_by_priority_year = cell(1,size(files,1));
corporates_by_priority_year = cell(1,size(files,1));
non_corporates_by_priority_year = cell(1,size(files,1));
set_of_inventors_by_priority_year = cell(1,size(files,1));
cited_references_by_priority_year = cell(1,size(files,1));
cited_patents_by_priority_year = cell(1,size(files,1));
IPC_subclass_frequency_by_priority_year = cell(1,size(files,1));
distinct_IPC_subclass_by_priority_year = cell(1,size(files,1));
top_5_IPC_subclass_patents_by_priority_year = cell(1,size(files,1));
top_10_IPC_subclass_patents_by_priority_year = cell(1,size(files,1));
% Create array of table names:
table_names = {'patents_by_application_year'; 'patents_by_priority_year';...
    'corporates_by_priority_year'; 'non_corporates_by_priority_year';...
    'set_of_inventors_by_priority_year'; 'cited_references_by_priority_year';...
    'cited_patents_by_priority_year'; 'distinct_IPC_subclass_by_priority_year';...
    'top_5_IPC_subclass_patents_by_priority_year'; 'top_10_IPC_subclass_patents_by_priority_year'};
%% Iterate through input .MAT files in the current folder directory:
for file_ID = 1:numel(files)
% for file_ID = 2:4
% for file_ID = 2:2
% for file_ID = 16:16
% for file_ID = 22:22
    % Select the next .MAT file to convert:
    name = files(file_ID).name;
   
    % Load .MAT file containing patent data:
    load(name, 'cell_array')
    % Select cell array containing the patent data to evaluate:
    data_set = 'cell_array';
    % Create table object from cell array data set:
    data_table = cell2table(eval(strcat(data_set, '(2:end,:)')));
    % Replace any ' ' values with a blank table cell:
    data_table = standardizeMissing(data_table,' ');
    % Determine the number of rows in the data table (not including
    % headers):
    rows = size(data_table,1);
    % Read in table headings as stored in cell array:
    headings = eval(strcat(data_set, '(1,:)'));
    % Remove any leading and trailing white spaces from headings:
    headings = strtrim(headings);
    % Remove or replace any remaining white spaces, slashes, brackets, full
    % stops, or hyphens in headings with underscores:
    headings = strrep(headings,' ','_');
    headings = strrep(headings,'/','_');
    headings = strrep(headings,'(','');
    headings = strrep(headings,')','');
    headings = strrep(headings,'.','');
    headings = strrep(headings,'-','_');
    % Identify headings that are unique (in the same order that they
    % currently appear in):
    [~,unique_headings_index,~] = unique(headings,'stable');
    unique_headings = ismember(1:size(headings,2),unique_headings_index);
    for i = 1:size(headings,2)
        if unique_headings(i) == 0
            headings(i) = strcat(headings(i),'_',num2str(i));
        end
    end
    % Assign headings to table variables:
    data_table.Properties.VariableNames = headings;
    % Add pivot table counter column to table:
    data_table.Pivot_Table_Counter = ones(rows,1);
    % Identify valid date entries:
    missing_dates = strcmp(data_table.Date,'0');
    missing_dates_index = find(missing_dates == 1);
    valid_dates = ~missing_dates;
    % Extract 'Basic Year' for each patent family:
    data_table.Basic_Year(valid_dates,1) = year(data_table.Date(valid_dates));
    % Convert required cell arrays into character arrays:
    current_assignee_name = char(data_table.Current_Applicant__or_Assignee_Name);
    family_normalized_assignee_name = char(data_table.Family_Normalized_Assignee_name);
    set_of_inventors = char(data_table.Inventor);
    % Define string truncation limits:
    char_limit_1 = 12;
    char_limit_2 = 13;
    % Preallocate empty arrays and tables for derived metrics:
    application_date = cell(rows,2);
    cited_references = zeros(rows,1);
    cited_patents = zeros(rows,1);
    matched_IPC_subclasses = array2table(zeros(rows,size(IPC_subclasses,1)),'VariableNames', IPC_subclasses);
    % Truncate data entries in 'Current Applicant or Assignee Name',
    % 'Family Normalized Assignee name', and 'Inventor' columns in order to
    % improve identification of unique entries:
    truncated_current_assignee_name = cellstr(current_assignee_name(:,1:char_limit_1));
    truncated_family_normalized_assignee_name = cellstr(family_normalized_assignee_name(:,1:char_limit_2));
    truncated_set_of_inventors = cellstr(set_of_inventors(:,1:char_limit_1));
    % Assign unique corporation ID to companies appearing in truncated sets
    % of data:
    if sum(completed_files) == 0
        unique_corporations_cumulative{file_ID} = unique(truncated_family_normalized_assignee_name);
        unique_corporations = unique_corporations_cumulative{file_ID};
    else
        new_unique_corporations = unique(truncated_family_normalized_assignee_name);
        new_unique_corporations_idx = find(~ismember(new_unique_corporations,unique_corporations));
        unique_corporations_cumulative{file_ID} = new_unique_corporations(new_unique_corporations_idx);
        unique_corporations = [unique_corporations;unique_corporations_cumulative{file_ID}];
    end
    [~,data_table.Corporation_ID] = ismember(truncated_family_normalized_assignee_name(:,1), unique_corporations);
    % Assign unique non-corporate ID to patents that are not associated
    % with a recognised company name appearing in the truncated sets of
    % data:   
    non_corporates_idx = find(strcmp(truncated_family_normalized_assignee_name, '') == 1);
    non_corporates = truncated_current_assignee_name(non_corporates_idx);
    if sum(completed_files) == 0
        unique_non_corporates_cumulative{file_ID} = unique(non_corporates);
        unique_non_corporates = unique_non_corporates_cumulative{file_ID};
    else
        new_unique_non_corporates = unique(non_corporates);
        new_unique_non_corporates_idx = find(~ismember(new_unique_non_corporates, unique_non_corporates));
        unique_non_corporates_cumulative{file_ID} = new_unique_non_corporates(new_unique_non_corporates_idx);
        unique_non_corporates = [unique_non_corporates; unique_non_corporates_cumulative{file_ID}];
    end
    [~,data_table.Non_Corporates_ID] = ismember(truncated_current_assignee_name(:,1), unique_non_corporates);
    % Assign unique set-of-inventors ID to groups of inventors appearing in
    % truncated sets of data:
    if sum(completed_files) == 0
        unique_set_of_inventors_cumulative{file_ID} = unique(truncated_set_of_inventors);
        unique_set_of_inventors = unique_set_of_inventors_cumulative{file_ID};
    else
        new_unique_set_of_inventors = unique(truncated_set_of_inventors);
        new_unique_set_of_inventors_idx = find(~ismember(new_unique_set_of_inventors, unique_set_of_inventors));
        unique_set_of_inventors_cumulative{file_ID} = new_unique_set_of_inventors(new_unique_set_of_inventors_idx);
        unique_set_of_inventors = [unique_set_of_inventors; unique_set_of_inventors_cumulative{file_ID}];
    end
    [~,data_table.Set_of_Inventors_ID] = ismember(truncated_set_of_inventors(:,1), unique_set_of_inventors);
    % Iterate through patent family records to extract specific patent
    % indicators which may have multiple values stored for the same record:
    for i = 1:rows
        % Extract the application date for each patent family:
        delimited_application_data = strsplit(char(data_table.Application_Data(i)));
        application_date(i,1) = delimited_application_data(2);
        if (strcmp(application_date(i,1), '0') == 1) || (sum(isletter(char(application_date(i,1)))) ~= 0)
            try
                possible_application_year = char(delimited_application_data(3));
                try
                    application_date{i,2} = possible_application_year(2:5);
                catch
                    application_date{i,2} = '0';
                end
            catch
                application_date{i,2} = '0';
            end
        end
        % Extract the priority dates for each patent family:
        delimited_priority_details = strsplit(char(data_table.Priority_Details(i)));
        % Reset and preallocate priority year array for each patent family:
        priority_year_set = zeros(1,size(delimited_priority_details,2));
        % Scan through delimited priority details to identify earliest
        % priority date:
        for j = 1:size(delimited_priority_details,2)
            try
                priority_year_set(j) = year(delimited_priority_details(j));
            catch
                priority_year_set(j) = NaN;
            end
        end
        % Identify the earliest 'Priority Year' from the priority year set
        % for each patent family:
        data_table.Priority_Year(i,1) = min(priority_year_set);
        % Count the number of references cited for each patent family:
        if strcmp(data_table.References(i),'') == 0
            cited_references(i) = size(strfind(char(data_table.References(i)), '<br /><br />'),2);
        else
            cited_references(i) = 0;
        end
        % Count the number of patents cited for each patent family:
        if strcmp(data_table.Cited_Patents(i),'') == 0
            cited_patents(i) = size(strfind(char(data_table.Cited_Patents(i)), '<br /><br />'),2);
        else
            cited_patents(i) = 0;
        end
        % Extract the distinct IPC subclasses associated with each patent
        % family:
        delimited_IPC_subclasses = char(strsplit(char(data_table.Current_IPC(i)), '<br /><br />'));
        % Set table row to zero if no IPC details are provided:
        if isempty(delimited_IPC_subclasses)
            matched_IPC_subclasses(i,:) = array2table(zeros(1, size(IPC_subclasses,1)));
        else
            % Truncate IPC codes to only the first four digits (subclasses)
            % and identify the distinct subclasses referenced by the
            % current patent family:
            truncated_IPC_subclasses = cellstr(delimited_IPC_subclasses(:,1:4));
            unique_IPC_subclasses = unique(truncated_IPC_subclasses);
            % Identify indexes of matched IPC subclasses:
            [~,matched_IPC_subclasses_idx] = ismember(unique_IPC_subclasses, IPC_subclasses);
            % Set corresponding IPC subclass count values to 1:
            try
                matched_IPC_subclasses(i, matched_IPC_subclasses_idx) = array2table(1);
            catch
                % No IPC subclass matches found for this patent family
                % (probably as a result of a typo in the recorded
                % 'Current_IPC' value)
            end
        end
    end
    % Identify valid application date entries:
    missing_application_dates = strcmp(application_date(:,1),'0') + ~cellfun(@isempty,application_date(:,2));
    missing_application_dates_index = find(missing_application_dates ~= 0);
    valid_application_dates = ~missing_application_dates;
   
    % Replace any empty alternative application dates with '0' (so that all
    % values are classed as strings):
    empty_alt_application_dates = cellfun('isempty', application_date(:,2));
    application_date(empty_alt_application_dates,2) = {'0'};
    % Extract the 'Application Year' from the application date:
    data_table.Application_Year(valid_application_dates,1) = year(application_date(valid_application_dates));
    % Identify missing 'Priority Year' entries:
    missing_Priority_Year_index = find(isnan(data_table.Priority_Year) == 1);
   
    % Replace missing 'Priority Year' values with minimum (non-zero where
    % possible) of 'Application Year' and 'Basic Year' values:
    alternative_year_values = [data_table.Basic_Year(missing_Priority_Year_index, 1) data_table.Application_Year(missing_Priority_Year_index, 1) cellfun(@str2num, application_date(missing_Priority_Year_index, 2))];
    alternative_year_values(alternative_year_values == 0) = inf;
    earliest_alternative_year = min(min(alternative_year_values(:,1), alternative_year_values(:,2)), alternative_year_values(:,3));
    earliest_alternative_year(earliest_alternative_year == inf) = 0;
    data_table.Priority_Year(missing_Priority_Year_index,1) = earliest_alternative_year;
   
    % Identify any still outstanding 'Priority Year' values:
    still_missing_Priority_Year_index = find(data_table.Priority_Year == 0);
   
    % Replace missing 'Application Year' values with 'Priority Year'
    % values:
    data_table.Application_Year(missing_application_dates_index,1) = data_table.Priority_Year(missing_application_dates_index,1);
   
    % Initialise the year counter for the timeframe considered in the
    % patent data set (the earliest known US patent is from 1790
    % - removing any years earlier than 1790 will also remove some entries
    % where typos, etc. have given incorrect data entries):
    time_period = (min(min(data_table.Priority_Year(data_table.Priority_Year >= 1790)),...
        min(data_table.Application_Year(data_table.Application_Year >= 1790))):...
        max(max(data_table.Priority_Year(data_table.Priority_Year <= 2020)),...
        max(data_table.Application_Year(data_table.Application_Year <= 2020))))';
   
    % Check if the time period considered in the current file is outside of
    % the existing time limits established from previous files, and update
    % global time limits as necessary:
    if upper_time_limit == inf
        upper_time_limit = max(time_period);
    elseif max(time_period) > upper_time_limit
        upper_time_limit = max(time_period);
    end
    if lower_time_limit == -inf
        lower_time_limit = min(time_period);
    elseif min(time_period) < lower_time_limit
        lower_time_limit = min(time_period);
    end
   
    % Preallocate empty arrays for derived metrics:
    corporates_by_priority_year_values = zeros(size(time_period,1),1);
    non_corporates_by_priority_year_values = zeros(size(time_period,1),1);
    set_of_inventors_by_priority_year_values = zeros(size(time_period,1),1);
    cited_references_by_priority_year_values = zeros(size(time_period,1),1);
    cited_patents_by_priority_year_values = zeros(size(time_period,1),1);
    IPC_subclass_frequency_by_priority_year_values = zeros(size(time_period,1), size(IPC_subclasses,1));
    distinct_IPC_subclass_by_priority_year_values = zeros(size(time_period,1),1);
    top_5_IPC_subclass_patents_by_priority_year_values = zeros(size(time_period,1),1);
    top_10_IPC_subclass_patents_by_priority_year_values = zeros(size(time_period,1),1);
    % Cycle through timeframe considered in patents to calculate yearly
    % totals:
    for i = 1:(max(time_period) - min(time_period))
        % Identify the indices of patent families recorded in the current
        % priority year of interest:
        idx = find(data_table.Priority_Year == time_period(i));
        % Count the number of unique corporate IDs associated with patents
        % in any given year (and remove blanks which have ID '1'):
        unique_corp_ID_by_year = unique(data_table.Corporation_ID(idx));
        unique_corp_ID_by_year_no_blanks = unique_corp_ID_by_year(unique_corp_ID_by_year ~= 1);
        corporates_by_priority_year_values(i) = size(unique_corp_ID_by_year_no_blanks,1);
        % Count the number of unique non-corporate IDs associated with
        % patents in any given year (and remove blanks which have ID '1',
        % and corporations which have ID '0'):
        unique_non_corp_ID_by_year = unique(data_table.Non_Corporates_ID(idx));
        unique_non_corp_ID_by_year_no_blanks = unique_non_corp_ID_by_year((unique_non_corp_ID_by_year ~= 0) & (unique_non_corp_ID_by_year ~= 1));
        non_corporates_by_priority_year_values(i) = size(unique_non_corp_ID_by_year_no_blanks,1);
        % Count the number of unique set of inventors IDs associated with
        % patents in any given year (and remove blanks which have ID '1'):
        unique_set_of_inventors_ID_by_year = unique(data_table.Set_of_Inventors_ID(idx));
        unique_set_of_inventors_ID_by_year_no_blanks = unique_set_of_inventors_ID_by_year( unique_set_of_inventors_ID_by_year ~= 1);
        set_of_inventors_by_priority_year_values(i) = size(unique_set_of_inventors_ID_by_year_no_blanks,1);
        % Count the total number of cited references associated with
        % patents in any given year:
        cited_references_by_priority_year_values(i) = sum(cited_references(idx));
        % Count the total number of cited patents associated with patents
        % in any given year:
        cited_patents_by_priority_year_values(i) = sum(cited_patents(idx));
        % Count the frequency of IPC subclasses being associated with
        % patents in any given year:
        IPC_subclass_frequency_by_priority_year_values(i,:) = sum(table2array(matched_IPC_subclasses(idx,:)),1);
        % Count the number of distinct IPC subclasses that are recorded as
        % being associated with patents in any given year:
        distinct_IPC_subclass_by_priority_year_values(i) = sum(IPC_subclass_frequency_by_priority_year_values(i,:) ~= 0);
        % Rank the IPC subclasses according to the number of associated
        % patent families for any given year:
        [max_IPC_subclass_counts,max_IPC_subclass_counts_idx] = sort(IPC_subclass_frequency_by_priority_year_values(i,:), 'descend');
        % Extract the aggregate count of the number of patent families
        % associated with the top 5 and top 10 IPC subclasses for any given
        % year:
        top_5_IPC_subclass_patents_by_priority_year_values(i) = sum(max_IPC_subclass_counts(1:5));
        top_10_IPC_subclass_patents_by_priority_year_values(i) = sum(max_IPC_subclass_counts(1:10));
    end
    % Build pivot and summary tables:
    patents_by_application_year{file_ID} = pivot_table(data_table, 'Application_Year','Pivot_Table_Counter',@sum);
    patents_by_priority_year{file_ID} = pivot_table(data_table, 'Priority_Year','Pivot_Table_Counter',@sum);
    corporates_by_priority_year{file_ID} = table(time_period, corporates_by_priority_year_values);
    non_corporates_by_priority_year{file_ID} = table(time_period, non_corporates_by_priority_year_values);
    set_of_inventors_by_priority_year{file_ID} = table(time_period, set_of_inventors_by_priority_year_values);
    cited_references_by_priority_year{file_ID} = table(time_period, cited_references_by_priority_year_values);
    cited_patents_by_priority_year{file_ID} = table(time_period, cited_patents_by_priority_year_values);
    IPC_subclass_frequency_by_priority_year{file_ID} = [table(time_period) array2table(IPC_subclass_frequency_by_priority_year_values, 'VariableNames', IPC_subclasses)];
    distinct_IPC_subclass_by_priority_year{file_ID} = table(time_period, distinct_IPC_subclass_by_priority_year_values);
    top_5_IPC_subclass_patents_by_priority_year{file_ID} = table(time_period, top_5_IPC_subclass_patents_by_priority_year_values);
    top_10_IPC_subclass_patents_by_priority_year{file_ID} = table(time_period, top_10_IPC_subclass_patents_by_priority_year_values);
   
    % Append the current file ID to the 'completed files' tracker:
    completed_files(file_ID) = 1;
   
    toc()
end
% Identify indices of completed files:
completed_files_idx = find(completed_files ~= 0);
% Build global time period table:
global_time_period = (lower_time_limit:upper_time_limit)';
%% Create structure containing nested output tables:
summary_tables = struct('Names', table_names', 'Tables', {patents_by_application_year, patents_by_priority_year,...
    corporates_by_priority_year, non_corporates_by_priority_year,...
    set_of_inventors_by_priority_year, cited_references_by_priority_year,...
    cited_patents_by_priority_year, distinct_IPC_subclass_by_priority_year,...
    top_5_IPC_subclass_patents_by_priority_year, top_10_IPC_subclass_patents_by_priority_year},...
    'Final_Tables',cell(1,numel(table_names)));
for i = 1:sum(completed_files)
    % Select completed file ID:
    file_ID = completed_files_idx(i);
   
    % Iterate through summary tables corresponding to each completed file:
    for j = 1:numel(table_names)
        % Expand summary tables as necessary to include years with zero
        % records:
        missing_years_idx = ~ismember(global_time_period, table2array(summary_tables(j).Tables{1, file_ID}(:,1)));
        missing_years = missing_years_idx .* global_time_period;
        missing_years = missing_years(missing_years ~= 0);
        missing_entries = array2table([missing_years zeros(size(missing_years,1),1)], 'VariableNames', eval(strcat(table_names{j}, '{file_ID}.Properties.VariableNames')));
        current_table = sortrows([summary_tables(j).Tables{1,file_ID}; missing_entries]);
       
        % Remove any years earlier than 1790 (which will also remove some
        % entries where typos, etc. have given incorrect data entries, such
        % as 'zero year' summary counts from tables):
        toDelete = logical((table2array(current_table(:,1)) < 1790) + (table2array(current_table(:,1)) > 2020));
        current_table(toDelete,:) = [];
        summary_tables(j).Tables{1,file_ID} = current_table;
    end   
end
% Combine summary tables into one overall patent indicator matrix:
for j = 1:numel(table_names)
    % Initialise or reset cumulative count for each table type:
    cumulative_count = zeros(size(global_time_period,1), sum(completed_files));
   
    % Append counts in completed tables into one summary vector for the
    % current table type:
    for i = 1:sum(completed_files)
        % Select completed file ID:
        file_ID = completed_files_idx(i);
       
        % Populate cumulative count array with the summary results from
        % each file:
        cumulative_count(:,i) = table2array(summary_tables(j).Tables{1, file_ID}(:,2));       
    end
   
    % Sum cumulative file counts to get the overall count:
    overall_count = sum(cumulative_count,2);
   
    % Send overall count back to summary tables:
    summary_tables(j).Final_Tables = [global_time_period overall_count];
end
% Save the current cell array to a .MAT file in the current working
% directory:
savefile = 'Summary development trends from patent data.mat';
save(savefile, 'summary_tables');
% Determine screensize so that figures are scaled to the right size for the
% current monitor (scrsz == screen size vector [left, bottom, width,
% height])
scrsz = get(0,'ScreenSize');
% Generate a new figure showing the development trends of all patent
% indicators considered:
figure('Name',['Development trends of patent indicators for ', lower(deepestFolder)], 'NumberTitle', 'off', 'OuterPosition', [1, 1, scrsz(3), scrsz(4)]);
hold on
% Plot all patent indicators on the same figure:
for j = 1:numel(table_names)
    plot(summary_tables(j).Final_Tables(:,1), summary_tables(j).Final_Tables(:,2))
end
% Set figure title, x-axis, and y-axis labels:
title(['Development trends of patent indicators for ', lower(deepestFolder)], 'FontSize',14);
xlabel('Year','FontSize',12);
ylabel('Count','FontSize',12);
% Add legend to figure for development trends:
legend(strrep(table_names,'_',' '), 'FontSize',12,'Location','northwest');
% Save the current figure to a .FIG file in the current working directory:
savefig(['Development trends of patent indicators for ', lower(deepestFolder),'.fig'])
toc()