Spaces:
Sleeping
Sleeping
Update patentwiz/preprocess_data.py
Browse files
- patentwiz/preprocess_data.py +12 -5
patentwiz/preprocess_data.py
CHANGED
@@ -86,7 +86,7 @@ def filter_rf_patents(patents, keywords=None, fields=None):
|
|
86 |
"""
|
87 |
Filters patents based on keywords and specified fields.
|
88 |
Parameters:
|
89 |
-
patents (list): List of
|
90 |
keywords (list): Keywords to filter patents.
|
91 |
fields (list): Fields to search for keywords (e.g., Title, Abstract, Claims).
|
92 |
Returns:
|
@@ -99,14 +99,21 @@ def filter_rf_patents(patents, keywords=None, fields=None):
|
|
99 |
|
100 |
filtered_patents = []
|
101 |
for patent in patents:
|
102 |
-
for
|
103 |
-
|
104 |
-
if any(keyword.lower() in
|
105 |
filtered_patents.append(patent)
|
106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
107 |
return filtered_patents
|
108 |
|
109 |
|
|
|
110 |
def extract_patents(year, month, day, logging):
|
111 |
"""
|
112 |
This function reads a patent file in XML format, splits it into individual patents, parses each
|
|
|
86 |
"""
|
87 |
Filters patents based on keywords and specified fields.
|
88 |
Parameters:
|
89 |
+
patents (list): List of patent texts (as strings or structured data).
|
90 |
keywords (list): Keywords to filter patents.
|
91 |
fields (list): Fields to search for keywords (e.g., Title, Abstract, Claims).
|
92 |
Returns:
|
|
|
99 |
|
100 |
filtered_patents = []
|
101 |
for patent in patents:
|
102 |
+
# If patent is a string, search for keywords in the entire text
|
103 |
+
if isinstance(patent, str):
|
104 |
+
if any(keyword.lower() in patent.lower() for keyword in keywords):
|
105 |
filtered_patents.append(patent)
|
106 |
+
# If patent is structured (e.g., dictionary), search within fields
|
107 |
+
elif isinstance(patent, dict):
|
108 |
+
for field in fields:
|
109 |
+
field_content = patent.get(field.lower(), "")
|
110 |
+
if any(keyword.lower() in field_content.lower() for keyword in keywords):
|
111 |
+
filtered_patents.append(patent)
|
112 |
+
break
|
113 |
return filtered_patents
|
114 |
|
115 |
|
116 |
+
|
117 |
def extract_patents(year, month, day, logging):
|
118 |
"""
|
119 |
This function reads a patent file in XML format, splits it into individual patents, parses each
|