Script 2: extract_patent_metrics.m
% This script is used to clean patent data files (already imported from
% HTML files using the 'batch_html_conversion.m' script and saved as .MAT
% files) and extract patent metrics required for use in subsequent 'nearest
% neighbour' pattern recognition analysis.
clearvars
clear all
close all
tic()
% Request the user to choose the current working directory (NB: if no
% folder is selected in the user interface then the current working
% directory is taken to be the filepath) OR do not ask the user and just
% take the current directory:
% filepath = uigetdir;
filepath = pwd;
% Identify the name of the selected working directory:
[upperPath,deepestFolder] = fileparts(filepath);
% Identify .MAT files in the selected working directory that match the name
% of the folder:
files = dir(strcat(deepestFolder,'*.mat'));
% Load list of International Patent Classification (IPC) subclasses:
load('IPC_subclasses.mat','IPC_subclasses')
% Initialise upper and lower global time limits:
upper_time_limit = inf;
lower_time_limit = -inf;
% Initialise the 'completed files' tracker:
completed_files = zeros(size(files,1),1);
% Preallocate empty cell arrays for building cumulative lists of unique
% corporate, non-corporate, and set of inventor entries:
unique_corporations_cumulative = cell(1,size(files,1));
unique_non_corporates_cumulative = cell(1,size(files,1));
unique_set_of_inventors_cumulative = cell(1,size(files,1));
% Preallocate empty cell arrays for compiling file summary counts:
patents_by_application_year = cell(1,size(files,1));
patents_by_priority_year = cell(1,size(files,1));
corporates_by_priority_year = cell(1,size(files,1));
non_corporates_by_priority_year = cell(1,size(files,1));
set_of_inventors_by_priority_year = cell(1,size(files,1));
cited_references_by_priority_year = cell(1,size(files,1));
cited_patents_by_priority_year = cell(1,size(files,1));
IPC_subclass_frequency_by_priority_year = cell(1,size(files,1));
distinct_IPC_subclass_by_priority_year = cell(1,size(files,1));
top_5_IPC_subclass_patents_by_priority_year = cell(1,size(files,1));
top_10_IPC_subclass_patents_by_priority_year = cell(1,size(files,1));
% Create array of table names:
table_names = {'patents_by_application_year'; 'patents_by_priority_year';...
'corporates_by_priority_year'; 'non_corporates_by_priority_year';...
'set_of_inventors_by_priority_year'; 'cited_references_by_priority_year';...
'cited_patents_by_priority_year'; 'distinct_IPC_subclass_by_priority_year';...
'top_5_IPC_subclass_patents_by_priority_year'; 'top_10_IPC_subclass_patents_by_priority_year'};
%% Iterate through input .MAT files in the current folder directory:
for file_ID = 1:numel(files)
% for file_ID = 2:4
% for file_ID = 2:2
% for file_ID = 16:16
% for file_ID = 22:22
% Select the next .MAT file to convert:
name = files(file_ID).name;
% Load .MAT file containing patent data:
load(name, 'cell_array')
% Select cell array containing the patent data to evaluate:
data_set = 'cell_array';
% Create table object from cell array data set:
data_table = cell2table(eval(strcat(data_set, '(2:end,:)')));
% Replace any ' ' values with a blank table cell:
data_table = standardizeMissing(data_table,' ');
% Determine the number of rows in the data table (not including
% headers):
rows = size(data_table,1);
% Read in table headings as stored in cell array:
headings = eval(strcat(data_set, '(1,:)'));
% Remove any leading and trailing white spaces from headings:
headings = strtrim(headings);
% Remove or replace any remaining white spaces, slashes, brackets, full
% stops, or hyphens in headings with underscores:
headings = strrep(headings,' ','_');
headings = strrep(headings,'/','_');
headings = strrep(headings,'(','');
headings = strrep(headings,')','');
headings = strrep(headings,'.','');
headings = strrep(headings,'-','_');
% Identify headings that are unique (in the same order that they
% currently appear in):
[~,unique_headings_index,~] = unique(headings,'stable');
unique_headings = ismember(1:size(headings,2),unique_headings_index);
for i = 1:size(headings,2)
if unique_headings(i) == 0
headings(i) = strcat(headings(i),'_',num2str(i));
end
end
% Assign headings to table variables:
data_table.Properties.VariableNames = headings;
% Add pivot table counter column to table:
data_table.Pivot_Table_Counter = ones(rows,1);
% Identify valid date entries:
missing_dates = strcmp(data_table.Date,'0');
missing_dates_index = find(missing_dates == 1);
valid_dates = ~missing_dates;
% Extract 'Basic Year' for each patent family:
data_table.Basic_Year(valid_dates,1) = year(data_table.Date(valid_dates));
% Convert required cell arrays into character arrays:
current_assignee_name = char(data_table.Current_Applicant__or_Assignee_Name);
family_normalized_assignee_name = char(data_table.Family_Normalized_Assignee_name);
set_of_inventors = char(data_table.Inventor);
% Define string truncation limits:
char_limit_1 = 12;
char_limit_2 = 13;
% Preallocate empty arrays and tables for derived metrics:
application_date = cell(rows,2);
cited_references = zeros(rows,1);
cited_patents = zeros(rows,1);
matched_IPC_subclasses = array2table(zeros(rows,size(IPC_subclasses,1)),'VariableNames', IPC_subclasses);
% Truncate data entries in 'Current Applicant or Assignee Name',
% 'Family Normalized Assignee name', and 'Inventor' columns in order to
% improve identification of unique entries:
truncated_current_assignee_name = cellstr(current_assignee_name(:,1:char_limit_1));
truncated_family_normalized_assignee_name = cellstr(family_normalized_assignee_name(:,1:char_limit_2));
truncated_set_of_inventors = cellstr(set_of_inventors(:,1:char_limit_1));
% Assign unique corporation ID to companies appearing in truncated sets
% of data:
if sum(completed_files) == 0
unique_corporations_cumulative{file_ID} = unique(truncated_family_normalized_assignee_name);
unique_corporations = unique_corporations_cumulative{file_ID};
else
new_unique_corporations = unique(truncated_family_normalized_assignee_name);
new_unique_corporations_idx = find(~ismember(new_unique_corporations,unique_corporations));
unique_corporations_cumulative{file_ID} = new_unique_corporations(new_unique_corporations_idx);
unique_corporations = [unique_corporations;unique_corporations_cumulative{file_ID}];
end
[~,data_table.Corporation_ID] = ismember(truncated_family_normalized_assignee_name(:,1), unique_corporations);
% Assign unique non-corporate ID to patents that are not associated
% with a recognised company name appearing in the truncated sets of
% data:
non_corporates_idx = find(strcmp(truncated_family_normalized_assignee_name, '') == 1);
non_corporates = truncated_current_assignee_name(non_corporates_idx);
if sum(completed_files) == 0
unique_non_corporates_cumulative{file_ID} = unique(non_corporates);
unique_non_corporates = unique_non_corporates_cumulative{file_ID};
else
new_unique_non_corporates = unique(non_corporates);
new_unique_non_corporates_idx = find(~ismember(new_unique_non_corporates, unique_non_corporates));
unique_non_corporates_cumulative{file_ID} = new_unique_non_corporates(new_unique_non_corporates_idx);
unique_non_corporates = [unique_non_corporates; unique_non_corporates_cumulative{file_ID}];
end
[~,data_table.Non_Corporates_ID] = ismember(truncated_current_assignee_name(:,1), unique_non_corporates);
% Assign unique set-of-inventors ID to groups of inventors appearing in
% truncated sets of data:
if sum(completed_files) == 0
unique_set_of_inventors_cumulative{file_ID} = unique(truncated_set_of_inventors);
unique_set_of_inventors = unique_set_of_inventors_cumulative{file_ID};
else
new_unique_set_of_inventors = unique(truncated_set_of_inventors);
new_unique_set_of_inventors_idx = find(~ismember(new_unique_set_of_inventors, unique_set_of_inventors));
unique_set_of_inventors_cumulative{file_ID} = new_unique_set_of_inventors(new_unique_set_of_inventors_idx);
unique_set_of_inventors = [unique_set_of_inventors; unique_set_of_inventors_cumulative{file_ID}];
end
[~,data_table.Set_of_Inventors_ID] = ismember(truncated_set_of_inventors(:,1), unique_set_of_inventors);
% Iterate through patent family records to extract specific patent
% indicators which may have multiple values stored for the same record:
for i = 1:rows
% Extract the application date for each patent family:
delimited_application_data = strsplit(char(data_table.Application_Data(i)));
application_date(i,1) = delimited_application_data(2);
if (strcmp(application_date(i,1), '0') == 1) || (sum(isletter(char(application_date(i,1)))) ~= 0)
try
possible_application_year = char(delimited_application_data(3));
try
application_date{i,2} = possible_application_year(2:5);
catch
application_date{i,2} = '0';
end
catch
application_date{i,2} = '0';
end
end
% Extract the priority dates for each patent family:
delimited_priority_details = strsplit(char(data_table.Priority_Details(i)));
% Reset and preallocate priority year array for each patent family:
priority_year_set = zeros(1,size(delimited_priority_details,2));
% Scan through delimited priority details to identify earliest
% priority date:
for j = 1:size(delimited_priority_details,2)
try
priority_year_set(j) = year(delimited_priority_details(j));
catch
priority_year_set(j) = NaN;
end
end
% Identify the earliest 'Priority Year' from the priority year set
% for each patent family:
data_table.Priority_Year(i,1) = min(priority_year_set);
% Count the number of references cited for each patent family:
if strcmp(data_table.References(i),'') == 0
cited_references(i) = size(strfind(char(data_table.References(i)), '<br /><br />'),2);
else
cited_references(i) = 0;
end
% Count the number of patents cited for each patent family:
if strcmp(data_table.Cited_Patents(i),'') == 0
cited_patents(i) = size(strfind(char(data_table.Cited_Patents(i)), '<br /><br />'),2);
else
cited_patents(i) = 0;
end
% Extract the distinct IPC subclasses associated with each patent
% family:
delimited_IPC_subclasses = char(strsplit(char(data_table.Current_IPC(i)), '<br /><br />'));
% Set table row to zero if no IPC details are provided:
if isempty(delimited_IPC_subclasses)
matched_IPC_subclasses(i,:) = array2table(zeros(1, size(IPC_subclasses,1)));
else
% Truncate IPC codes to only the first four digits (subclasses)
% and identify the distinct subclasses referenced by the
% current patent family:
truncated_IPC_subclasses = cellstr(delimited_IPC_subclasses(:,1:4));
unique_IPC_subclasses = unique(truncated_IPC_subclasses);
% Identify indexes of matched IPC subclasses:
[~,matched_IPC_subclasses_idx] = ismember(unique_IPC_subclasses, IPC_subclasses);
% Set corresponding IPC subclass count values to 1:
try
matched_IPC_subclasses(i, matched_IPC_subclasses_idx) = array2table(1);
catch
% No IPC subclass matches found for this patent family
% (probably as a result of a typo in the recorded
% 'Current_IPC' value)
end
end
end
% Identify valid application date entries:
missing_application_dates = strcmp(application_date(:,1),'0') + ~cellfun(@isempty,application_date(:,2));
missing_application_dates_index = find(missing_application_dates ~= 0);
valid_application_dates = ~missing_application_dates;
% Replace any empty alternative application dates with '0' (so that all
% values are classed as strings):
empty_alt_application_dates = cellfun('isempty', application_date(:,2));
application_date(empty_alt_application_dates,2) = {'0'};
% Extract the 'Application Year' from the application date:
data_table.Application_Year(valid_application_dates,1) = year(application_date(valid_application_dates));
% Identify missing 'Priority Year' entries:
missing_Priority_Year_index = find(isnan(data_table.Priority_Year) == 1);
% Replace missing 'Priority Year' values with minimum (non-zero where
% possible) of 'Application Year' and 'Basic Year' values:
alternative_year_values = [data_table.Basic_Year(missing_Priority_Year_index, 1) data_table.Application_Year(missing_Priority_Year_index, 1) cellfun(@str2num, application_date(missing_Priority_Year_index, 2))];
alternative_year_values(alternative_year_values == 0) = inf;
earliest_alternative_year = min(min(alternative_year_values(:,1), alternative_year_values(:,2)), alternative_year_values(:,3));
earliest_alternative_year(earliest_alternative_year == inf) = 0;
data_table.Priority_Year(missing_Priority_Year_index,1) = earliest_alternative_year;
% Identify any still outstanding 'Priority Year' values:
still_missing_Priority_Year_index = find(data_table.Priority_Year == 0);
% Replace missing 'Application Year' values with 'Priority Year'
% values:
data_table.Application_Year(missing_application_dates_index,1) = data_table.Priority_Year(missing_application_dates_index,1);
% Initialise the year counter for the timeframe considered in the
% patent data set (the earliest known US patent is from 1790
% - removing any years earlier than 1790 will also remove some entries
% where typos, etc. have given incorrect data entries):
time_period = (min(min(data_table.Priority_Year(data_table.Priority_Year >= 1790)),...
min(data_table.Application_Year(data_table.Application_Year >= 1790))):...
max(max(data_table.Priority_Year(data_table.Priority_Year <= 2020)),...
max(data_table.Application_Year(data_table.Application_Year <= 2020))))';
% Check if the time period considered in the current file is outside of
% the existing time limits established from previous files, and update
% global time limits as necessary:
if upper_time_limit == inf
upper_time_limit = max(time_period);
elseif max(time_period) > upper_time_limit
upper_time_limit = max(time_period);
end
if lower_time_limit == -inf
lower_time_limit = min(time_period);
elseif min(time_period) < lower_time_limit
lower_time_limit = min(time_period);
end
% Preallocate empty arrays for derived metrics:
corporates_by_priority_year_values = zeros(size(time_period,1),1);
non_corporates_by_priority_year_values = zeros(size(time_period,1),1);
set_of_inventors_by_priority_year_values = zeros(size(time_period,1),1);
cited_references_by_priority_year_values = zeros(size(time_period,1),1);
cited_patents_by_priority_year_values = zeros(size(time_period,1),1);
IPC_subclass_frequency_by_priority_year_values = zeros(size(time_period,1), size(IPC_subclasses,1));
distinct_IPC_subclass_by_priority_year_values = zeros(size(time_period,1),1);
top_5_IPC_subclass_patents_by_priority_year_values = zeros(size(time_period,1),1);
top_10_IPC_subclass_patents_by_priority_year_values = zeros(size(time_period,1),1);
% Cycle through timeframe considered in patents to calculate yearly
% totals:
for i = 1:(max(time_period) - min(time_period))
% Identify the indices of patent families recorded in the current
% priority year of interest:
idx = find(data_table.Priority_Year == time_period(i));
% Count the number of unique corporate IDs associated with patents
% in any given year (and remove blanks which have ID '1'):
unique_corp_ID_by_year = unique(data_table.Corporation_ID(idx));
unique_corp_ID_by_year_no_blanks = unique_corp_ID_by_year(unique_corp_ID_by_year ~= 1);
corporates_by_priority_year_values(i) = size(unique_corp_ID_by_year_no_blanks,1);
% Count the number of unique non-corporate IDs associated with
% patents in any given year (and remove blanks which have ID '1',
% and corporations which have ID '0'):
unique_non_corp_ID_by_year = unique(data_table.Non_Corporates_ID(idx));
unique_non_corp_ID_by_year_no_blanks = unique_non_corp_ID_by_year((unique_non_corp_ID_by_year ~= 0) & (unique_non_corp_ID_by_year ~= 1));
non_corporates_by_priority_year_values(i) = size(unique_non_corp_ID_by_year_no_blanks,1);
% Count the number of unique set of inventors IDs associated with
% patents in any given year (and remove blanks which have ID '1'):
unique_set_of_inventors_ID_by_year = unique(data_table.Set_of_Inventors_ID(idx));
unique_set_of_inventors_ID_by_year_no_blanks = unique_set_of_inventors_ID_by_year( unique_set_of_inventors_ID_by_year ~= 1);
set_of_inventors_by_priority_year_values(i) = size(unique_set_of_inventors_ID_by_year_no_blanks,1);
% Count the total number of cited references associated with
% patents in any given year:
cited_references_by_priority_year_values(i) = sum(cited_references(idx));
% Count the total number of cited patents associated with patents
% in any given year:
cited_patents_by_priority_year_values(i) = sum(cited_patents(idx));
% Count the frequency of IPC subclasses being associated with
% patents in any given year:
IPC_subclass_frequency_by_priority_year_values(i,:) = sum(table2array(matched_IPC_subclasses(idx,:)),1);
% Count the number of distinct IPC subclasses that are recorded as
% being associated with patents in any given year:
distinct_IPC_subclass_by_priority_year_values(i) = sum(IPC_subclass_frequency_by_priority_year_values(i,:) ~= 0);
% Rank the IPC subclasses according to the number of associated
% patent families for any given year:
[max_IPC_subclass_counts,max_IPC_subclass_counts_idx] = sort(IPC_subclass_frequency_by_priority_year_values(i,:), 'descend');
% Extract the aggregate count of the number of patent families
% associated with the top 5 and top 10 IPC subclasses for any given
% year:
top_5_IPC_subclass_patents_by_priority_year_values(i) = sum(max_IPC_subclass_counts(1:5));
top_10_IPC_subclass_patents_by_priority_year_values(i) = sum(max_IPC_subclass_counts(1:10));
end
% Build pivot and summary tables:
patents_by_application_year{file_ID} = pivot_table(data_table, 'Application_Year','Pivot_Table_Counter',@sum);
patents_by_priority_year{file_ID} = pivot_table(data_table, 'Priority_Year','Pivot_Table_Counter',@sum);
corporates_by_priority_year{file_ID} = table(time_period, corporates_by_priority_year_values);
non_corporates_by_priority_year{file_ID} = table(time_period, non_corporates_by_priority_year_values);
set_of_inventors_by_priority_year{file_ID} = table(time_period, set_of_inventors_by_priority_year_values);
cited_references_by_priority_year{file_ID} = table(time_period, cited_references_by_priority_year_values);
cited_patents_by_priority_year{file_ID} = table(time_period, cited_patents_by_priority_year_values);
IPC_subclass_frequency_by_priority_year{file_ID} = [table(time_period) array2table(IPC_subclass_frequency_by_priority_year_values, 'VariableNames', IPC_subclasses)];
distinct_IPC_subclass_by_priority_year{file_ID} = table(time_period, distinct_IPC_subclass_by_priority_year_values);
top_5_IPC_subclass_patents_by_priority_year{file_ID} = table(time_period, top_5_IPC_subclass_patents_by_priority_year_values);
top_10_IPC_subclass_patents_by_priority_year{file_ID} = table(time_period, top_10_IPC_subclass_patents_by_priority_year_values);
% Append the current file ID to the 'completed files' tracker:
completed_files(file_ID) = 1;
toc()
end
% Identify indices of completed files:
completed_files_idx = find(completed_files ~= 0);
% Build global time period table:
global_time_period = (lower_time_limit:upper_time_limit)';
%% Create structure containing nested output tables:
summary_tables = struct('Names', table_names', 'Tables', {patents_by_application_year, patents_by_priority_year,...
corporates_by_priority_year, non_corporates_by_priority_year,...
set_of_inventors_by_priority_year, cited_references_by_priority_year,...
cited_patents_by_priority_year, distinct_IPC_subclass_by_priority_year,...
top_5_IPC_subclass_patents_by_priority_year, top_10_IPC_subclass_patents_by_priority_year},...
'Final_Tables',cell(1,numel(table_names)));
for i = 1:sum(completed_files)
% Select completed file ID:
file_ID = completed_files_idx(i);
% Iterate through summary tables corresponding to each completed file:
for j = 1:numel(table_names)
% Expand summary tables as necessary to include years with zero
% records:
missing_years_idx = ~ismember(global_time_period, table2array(summary_tables(j).Tables{1, file_ID}(:,1)));
missing_years = missing_years_idx .* global_time_period;
missing_years = missing_years(missing_years ~= 0);
missing_entries = array2table([missing_years zeros(size(missing_years,1),1)], 'VariableNames', eval(strcat(table_names{j}, '{file_ID}.Properties.VariableNames')));
current_table = sortrows([summary_tables(j).Tables{1,file_ID}; missing_entries]);
% Remove any years earlier than 1790 (which will also remove some
% entries where typos, etc. have given incorrect data entries, such
% as 'zero year' summary counts from tables):
toDelete = logical((table2array(current_table(:,1)) < 1790) + (table2array(current_table(:,1)) > 2020));
current_table(toDelete,:) = [];
summary_tables(j).Tables{1,file_ID} = current_table;
end
end
% Combine summary tables into one overall patent indicator matrix:
for j = 1:numel(table_names)
% Initialise or reset cumulative count for each table type:
cumulative_count = zeros(size(global_time_period,1), sum(completed_files));
% Append counts in completed tables into one summary vector for the
% current table type:
for i = 1:sum(completed_files)
% Select completed file ID:
file_ID = completed_files_idx(i);
% Populate cumulative count array with the summary results from
% each file:
cumulative_count(:,i) = table2array(summary_tables(j).Tables{1, file_ID}(:,2));
end
% Sum cumulative file counts to get the overall count:
overall_count = sum(cumulative_count,2);
% Send overall count back to summary tables:
summary_tables(j).Final_Tables = [global_time_period overall_count];
end
% Save the current cell array to a .MAT file in the current working
% directory:
savefile = 'Summary development trends from patent data.mat';
save(savefile, 'summary_tables');
% Determine screensize so that figures are scaled to the right size for the
% current monitor (scrsz == screen size vector [left, bottom, width,
% height])
scrsz = get(0,'ScreenSize');
% Generate a new figure showing the development trends of all patent
% indicators considered:
figure('Name',['Development trends of patent indicators for ', lower(deepestFolder)], 'NumberTitle', 'off', 'OuterPosition', [1, 1, scrsz(3), scrsz(4)]);
hold on
% Plot all patent indicators on the same figure:
for j = 1:numel(table_names)
plot(summary_tables(j).Final_Tables(:,1), summary_tables(j).Final_Tables(:,2))
end
% Set figure title, x-axis, and y-axis labels:
title(['Development trends of patent indicators for ', lower(deepestFolder)], 'FontSize',14);
xlabel('Year','FontSize',12);
ylabel('Count','FontSize',12);
% Add legend to figure for development trends:
legend(strrep(table_names,'_',' '), 'FontSize',12,'Location','northwest');
% Save the current figure to a .FIG file in the current working directory:
savefig(['Development trends of patent indicators for ', lower(deepestFolder),'.fig'])
toc()