DrishtiSharma committed on
Commit
2e9acfd
·
verified ·
1 Parent(s): d598d5a

Update patentwiz/preprocess_data.py

Browse files
Files changed (1) hide show
  1. patentwiz/preprocess_data.py +12 -5
patentwiz/preprocess_data.py CHANGED
@@ -86,7 +86,7 @@ def filter_rf_patents(patents, keywords=None, fields=None):
86
  """
87
  Filters patents based on keywords and specified fields.
88
  Parameters:
89
- patents (list): List of patents.
90
  keywords (list): Keywords to filter patents.
91
  fields (list): Fields to search for keywords (e.g., Title, Abstract, Claims).
92
  Returns:
@@ -99,14 +99,21 @@ def filter_rf_patents(patents, keywords=None, fields=None):
99
 
100
  filtered_patents = []
101
  for patent in patents:
102
- for field in fields:
103
- field_content = patent.get(field.lower(), "")
104
- if any(keyword.lower() in field_content.lower() for keyword in keywords):
105
  filtered_patents.append(patent)
106
- break
 
 
 
 
 
 
107
  return filtered_patents
108
 
109
 
 
110
  def extract_patents(year, month, day, logging):
111
  """
112
  This function reads a patent file in XML format, splits it into individual patents, parses each
 
86
  """
87
  Filters patents based on keywords and specified fields.
88
  Parameters:
89
+ patents (list): List of patent texts (as strings or structured data).
90
  keywords (list): Keywords to filter patents.
91
  fields (list): Fields to search for keywords (e.g., Title, Abstract, Claims).
92
  Returns:
 
99
 
100
  filtered_patents = []
101
  for patent in patents:
102
+ # If patent is a string, search for keywords in the entire text
103
+ if isinstance(patent, str):
104
+ if any(keyword.lower() in patent.lower() for keyword in keywords):
105
  filtered_patents.append(patent)
106
+ # If patent is structured (e.g., dictionary), search within fields
107
+ elif isinstance(patent, dict):
108
+ for field in fields:
109
+ field_content = patent.get(field.lower(), "")
110
+ if any(keyword.lower() in field_content.lower() for keyword in keywords):
111
+ filtered_patents.append(patent)
112
+ break
113
  return filtered_patents
114
 
115
 
116
+
117
  def extract_patents(year, month, day, logging):
118
  """
119
  This function reads a patent file in XML format, splits it into individual patents, parses each