App Module

PDFProcessingError

Bases: Exception

Custom exception for PDF processing errors.

Source code in backend\processing.py
class PDFProcessingError(Exception):
    """Custom exception for PDF processing errors."""
    pass

construct_search_query(selected_keywords)

Construct a search query string using logical AND.

Source code in backend\processing.py
def construct_search_query(selected_keywords):
    """
    Construct a search query string using logical AND.
    """
    return ' AND '.join(selected_keywords)
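
A minimal usage sketch (the keyword list is hypothetical):

keywords = ['machine learning', 'federated', 'privacy']
query = construct_search_query(keywords)
# query == 'machine learning AND federated AND privacy'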

execute_search_scopus(query, scopus_api_key, threshold=1000)

Execute the search query using the Scopus API via elsapy.

Parameters:
- query: Search query string.
- scopus_api_key: Dict containing 'apikey' and 'insttoken'.
- threshold: Maximum match count for which full results are retrieved (default 1000).

Returns:
- match_count: Number of matching articles.
- matched_papers: List of dicts with paper metadata (Scopus ID, first author, year, title, journal, citations, open-access flag, link); empty when the count is zero, exceeds the threshold, or an error occurs.

Source code in backend\processing.py
def execute_search_scopus(query, scopus_api_key, threshold=1000):
    """
    Execute the search query using the Scopus API via elsapy.

    Parameters:
    - query: Search query string.
    - scopus_api_key: Dict containing 'apikey' and 'insttoken'.
    - threshold: Maximum match count for which full results are retrieved.

    Returns:
    - match_count: Number of matching articles.
    - matched_papers: List of dicts with paper metadata; empty when the count
      is zero, exceeds the threshold, or an error occurs.
    """
    # ElsClient builds its own request headers from the API key and the
    # optional institutional token, so no manual headers dict is needed.
    client = ElsClient(scopus_api_key['apikey'])
    client.inst_token = scopus_api_key['insttoken']

    search_query = f'TITLE-ABS-KEY({query})'
    doc_srch = ElsSearch(search_query, 'scopus')

    try:
        doc_srch.execute(client, get_all=False)
        num_results = doc_srch.tot_num_res

        if num_results > 0 and num_results <= threshold:
            doc_srch.execute(client, get_all=True)
            data = doc_srch.results
            matched_papers = []
            for entry in data:
                # The Scopus web link is usually the third entry in 'link';
                # guard against shorter lists to avoid an IndexError.
                links = entry.get('link', [])
                scopus_link = links[2].get('@href', '#') if len(links) > 2 else '#'
                paper_info = {
                    'scopus_id': entry.get('dc:identifier', '').replace('SCOPUS_ID:', ''),
                    'first_author': entry.get('dc:creator', '-'),
                    'year': entry.get('prism:coverDate', '-')[:4],
                    'title': entry.get('dc:title', '-'),
                    'journal': entry.get('prism:publicationName', '-'),
                    'citations': entry.get('citedby-count', '0'),
                    'open_access': entry.get('openaccess', '-'),
                    'link': scopus_link
                }
                matched_papers.append(paper_info)
            match_count = len(matched_papers)
            return match_count, matched_papers
        else:
            # Zero matches or too many: report the count without metadata.
            return num_results, []

    except RequestException as req_err:
        # Handle issues with network or API request
        logger.error(f"Network or API request error during Scopus API call: {req_err}")
        return 0, []

    except KeyError as key_err:
        # Handle missing data in the response
        logger.error(f"Missing expected data in Scopus API response: {key_err}")
        return 0, []

    except Exception as e:
        # General catch-all for other unforeseen errors
        logger.error(f"Unexpected error during Scopus API call: {e}")
        return 0, []
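
A usage sketch with placeholder credentials (the key values are hypothetical; a real Scopus API key and institutional token are required):

scopus_api_key = {'apikey': 'YOUR_API_KEY', 'insttoken': 'YOUR_INST_TOKEN'}
count, papers = execute_search_scopus('machine AND learning', scopus_api_key)
if papers:
    for paper in papers[:3]:
        print(paper['year'], paper['title'], paper['link'])
else:
    print(f"{count} matches; no metadata retrieved (zero, above threshold, or error)")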

extract_seed(files)

Extract text from a list of uploaded PDF files and return a list of preprocessed strings.

Each element in the returned list corresponds to the text content of one PDF document. The text is preprocessed (e.g., cleaned of stopwords and punctuation) for further use.

Parameters:
- files (list): A list of PDF files uploaded by the user.

Returns:
- list: A list of preprocessed text strings, where each string represents the content of one PDF file.

Source code in backend\processing.py
def extract_seed(files):
    """
    Extract text from a list of uploaded PDF files and return a list of preprocessed strings.

    Each element in the returned list corresponds to the text content of one PDF document.
    The text is preprocessed (e.g., cleaned of stopwords and punctuation) for further use.

    Parameters:
    - files (list): A list of PDF files uploaded by the user.

    Returns:
    - list: A list of preprocessed text strings, where each string represents the content of one PDF file.
    """
    seed_texts = []
    for file in files:
        text = extract_text_from_pdf(file)
        seed_texts.append(text)
    return seed_texts
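
A minimal sketch (the file names are hypothetical; any binary file-like objects PyPDF2 can read will work):

with open('a.pdf', 'rb') as f1, open('b.pdf', 'rb') as f2:
    seed_texts = extract_seed([f1, f2])
# seed_texts[0] holds the preprocessed text of a.pdf, seed_texts[1] that of b.pdf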

extract_text_from_pdf(file)

Extract and preprocess text from a single PDF file.

This function reads through all the pages of the provided PDF file, extracts the text, and performs preprocessing (such as cleaning, lowercasing, and tokenization) on the extracted text for further use in natural language processing tasks.

Parameters:
- file: A PDF file object from which text needs to be extracted.

Returns:
- str: A single preprocessed string containing the combined text from all pages of the PDF, ready for further analysis.

Source code in backend\processing.py
def extract_text_from_pdf(file):
    """
    Extract and preprocess text from a single PDF file.

    This function reads through all the pages of the provided PDF file, extracts the text, and performs preprocessing
    (such as cleaning, lowercasing, and tokenization) on the extracted text for further use in natural language processing tasks.

    Parameters:
    - file: A PDF file object from which text needs to be extracted.

    Returns:
    - str: A single preprocessed string containing the combined text from all pages of the PDF, ready for further analysis.
    """
    try:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            extracted_text = page.extract_text()
            if extracted_text:
                text += extracted_text + " "

        # Preprocess the extracted text
        text = preprocess_text(text)
        return text

    except PdfReadError as e:
        # Raise a custom error and stop processing
        raise PDFProcessingError(f"Error processing PDF file '{file}': {e}")
    except Exception as e:
        # Raise a custom error for any other general issue
        raise PDFProcessingError(f"An unexpected error occurred while processing '{file}': {e}")
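
Because failures are raised as PDFProcessingError, callers should wrap the call; a minimal sketch (the path is hypothetical):

try:
    with open('paper.pdf', 'rb') as f:
        cleaned = extract_text_from_pdf(f)
except PDFProcessingError as err:
    print(f"Skipping unreadable PDF: {err}")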

get_keywords(seed_texts, num_keywords)

Extract the top 'num_keywords' keywords from a list of documents using TF-IDF (Term Frequency-Inverse Document Frequency).

This function performs the following steps:
- Combines custom stop words with standard English stop words.
- Initializes a TfidfVectorizer to convert the text data into a matrix of TF-IDF features, considering unigrams only.
- Fits the TF-IDF model to the provided documents (seed_texts).
- Sums the TF-IDF scores across all documents to rank the importance of each keyword.
- Filters out numbers and stop words from the resulting keywords.

Parameters:
- seed_texts (list of str): A list where each element represents the (filtered) text of a document.
- num_keywords (int): The number of top keywords to extract.

Returns:
- list of dict: A list of dictionaries containing the top keywords and their corresponding TF-IDF scores. Each dictionary has two keys:
  - 'word': The keyword.
  - 'weight': The TF-IDF score, rounded to two decimal places.

Source code in backend\processing.py
def get_keywords(seed_texts, num_keywords):
    """
    Extract the top 'num_keywords' keywords from a list of documents using TF-IDF (Term Frequency-Inverse Document Frequency).

    This function performs the following steps:
    - Combines custom stop words with standard English stop words.
    - Initializes a TfidfVectorizer to convert the text data into a matrix of TF-IDF features, considering unigrams only.
    - Fits the TF-IDF model to the provided documents (seed_texts).
    - Sums the TF-IDF scores across all documents to rank the importance of each keyword.
    - Filters out numbers and stop words from the resulting keywords.

    Parameters:
    - seed_texts (list of str): A list where each element represents the (filtered) text of a document.
    - num_keywords (int): The number of top keywords to extract.

    Returns:
    - list of dict: A list of dictionaries containing the top keywords and their corresponding TF-IDF scores.
      Each dictionary has two keys:
        - 'word': The keyword.
        - 'weight': The TF-IDF score, rounded to two decimal places.
    """

    # Combine with English stop words from TfidfVectorizer
    combined_stop_words = list(set(ENGLISH_STOP_WORDS).union(additional_stop_words))

    # Initialize TfidfVectorizer with extended stop words and improved tokenization
    vectorizer = TfidfVectorizer(
        stop_words=combined_stop_words,  # TfidfVectorizer expects a list
        max_features=num_keywords,
        token_pattern=r'\b[a-zA-Z]{2,}\b',  # Tokens with at least two letters
        ngram_range=(1, 1),  # Unigrams only
        smooth_idf=True,
        sublinear_tf=True
    )

    # Fit and transform the list of documents
    tfidf_matrix = vectorizer.fit_transform(seed_texts)
    feature_names = vectorizer.get_feature_names_out()

    # Sum TF-IDF scores across all documents
    scores = tfidf_matrix.sum(axis=0).A1  # Convert to 1D array
    keywords = sorted(zip(feature_names, scores), key=lambda x: x[1], reverse=True)

    # Filter and select top keywords
    filtered_keywords = []
    for word, weight in keywords:
        words = word.split()
        if any(w in combined_stop_words for w in words):
            continue
        if re.search(r'\d', word):
            continue
        filtered_keywords.append({'word': word, 'weight': round(weight, 2)})
        if len(filtered_keywords) == num_keywords:
            break

    return filtered_keywords
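
A sketch of the expected input and output shape (the texts are hypothetical and the weights illustrative, not real output):

seed_texts = [
    'graph neural network node classification benchmark',
    'message passing graph representation learning',
]
top = get_keywords(seed_texts, num_keywords=3)
# e.g. [{'word': 'graph', 'weight': 1.02}, {'word': 'learning', 'weight': 0.55}, ...]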

preprocess_text(text)

Preprocess the input text by applying various cleaning and filtering steps for NLP tasks.

This function performs the following operations:
- Converts the text to lowercase.
- Removes URLs and numeric values.
- Utilizes spaCy for lemmatization, and filters out stop words, punctuation, and non-alphabetic tokens.
- Excludes tokens tied to specific named entities (organizations, people, geopolitical entities, dates) and certain parts of speech (proper nouns, numbers).
- Removes custom stop words.

Parameters:
- text (str): The raw text to be processed.

Returns:
- str: A cleaned and preprocessed string where tokens have been lemmatized and unnecessary elements have been removed.

Source code in backend\processing.py
def preprocess_text(text):
    """
    Preprocess the input text by applying various cleaning and filtering steps for NLP tasks.

    This function performs the following operations:
    - Converts the text to lowercase.
    - Removes URLs and numeric values.
    - Utilizes spaCy for lemmatization, and filters out stop words, punctuation, and non-alphabetic tokens.
    - Excludes tokens related to specific named entities (organizations, people, geopolitical entities, dates) and certain parts of speech (proper nouns, numbers).
    - Custom stop words are also removed.

    Parameters:
    - text (str): The raw text to be processed.

    Returns:
    - str: A cleaned and preprocessed string where tokens have been lemmatized and unnecessary elements have been removed.
    """
    # Convert text to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove numbers
    text = re.sub(r'\b\d+\b', '', text)

    # Use spaCy to process text and remove stop words, punctuation, etc.
    doc = nlp(text)
    cleaned_tokens = []
    for token in doc:
        if not token.is_stop and not token.is_punct and token.is_alpha:
            # Exclude certain named entities or tokens
            if token.text in additional_stop_words:
                continue
            if token.ent_type_ in {'ORG', 'PERSON', 'GPE', 'DATE'}:
                continue
            if token.pos_ in {'PROPN', 'NUM'}:
                continue
            cleaned_tokens.append(token.lemma_)

    cleaned_text = ' '.join(cleaned_tokens)
    return cleaned_text
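
An illustrative before/after (the exact tokens depend on the loaded spaCy model and the custom stop-word list):

raw = "We trained 12 models; see https://example.org for the full results."
print(preprocess_text(raw))
# e.g. 'train model result'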

scopus_sampling_process(weight_dict, threshold, outer_iterations=5, progress_callback=None, scopus_api_key=None)

Perform the sampling process using Scopus API with outer and inner iterations.

Parameters:
- weight_dict: Dict of keywords and their weights.
- threshold: The match count threshold.
- outer_iterations: Number of separate sampling runs.
- progress_callback: Function to call with progress updates.
- scopus_api_key: Dict containing 'apikey' and 'insttoken'.

Returns:
- ranked_papers: List of dictionaries containing paper information, sorted by occurrences.

Source code in backend\processing.py
def scopus_sampling_process(weight_dict, threshold, outer_iterations=5, progress_callback=None, scopus_api_key=None):
    """
    Perform the sampling process using Scopus API with outer and inner iterations.

    Parameters:
    - weight_dict: Dict of keywords and their weights.
    - threshold: The match count threshold.
    - outer_iterations: Number of separate sampling runs.
    - progress_callback: Function to call with progress updates.
    - scopus_api_key: Dict containing 'apikey' and 'insttoken'.

    Returns:
    - ranked_papers: List of dictionaries containing paper information, sorted by occurrences.
    """
    if not scopus_api_key:
        logger.warning("No Scopus API Key provided. Cannot perform real sampling.")
        return []

    keywords = list(weight_dict.keys())
    weights = list(weight_dict.values())

    paper_rank_counts = {}

    for outer in range(1, outer_iterations + 1):
        print(f"\n--- Outer Iteration {outer} ---")
        search_keywords = []
        matched_papers = []  # Defined up front in case no search executes
        while True:
            selected_keyword = weighted_random_selection(keywords, weights)
            if not selected_keyword:
                logger.warning("No keyword selected. Ending inner iterations.")
                break
            # Prevent adding duplicate keywords
            if selected_keyword in search_keywords:
                print(f"Keyword '{selected_keyword}' already in query. Selecting a different keyword.")
                continue
            search_keywords.append(selected_keyword)
            query = construct_search_query(search_keywords)
            match_count, matched_papers = execute_search_scopus(query, scopus_api_key, threshold)
            print(f"Added '{selected_keyword}' | Query: '{query}' | Matches: {match_count}")

            # Update progress
            if progress_callback:
                progress_callback(outer, query, match_count)

            if match_count < threshold:
                print(f"Match count {match_count} below threshold {threshold}. Ending inner iterations.")
                break

            # Small delay between consecutive API calls
            time.sleep(0.1)

        # Record matched papers from the final inner iteration
        for paper in matched_papers:
            scopus_id = paper['scopus_id']
            if scopus_id in paper_rank_counts:
                paper_rank_counts[scopus_id]['occurrences'] += 1
            else:
                paper['occurrences'] = 1
                paper_rank_counts[scopus_id] = paper

            #print(f"Recorded paper: {scopus_id} | Current occurrences: {paper_rank_counts[scopus_id]['occurrences']}")

        # Add a small delay after each outer iteration
        time.sleep(0.1)

    # Create a ranked list sorted by occurrences descending
    ranked_papers = sorted(paper_rank_counts.values(), key=lambda x: x['occurrences'], reverse=True)
    print("\n--- Sampling Completed ---")
    return ranked_papers
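
A usage sketch (weights and credentials are placeholders; the callback signature matches the progress_callback(outer, query, match_count) call above):

def report(outer, query, match_count):
    print(f"Run {outer}: '{query}' -> {match_count} matches")

weights = {'graph': 1.02, 'network': 0.87, 'embedding': 0.54}
ranked = scopus_sampling_process(weights, threshold=1000, outer_iterations=5,
                                 progress_callback=report,
                                 scopus_api_key={'apikey': 'YOUR_API_KEY',
                                                 'insttoken': 'YOUR_INST_TOKEN'})
# ranked[0] is the paper recorded in the most sampling runs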

weighted_random_selection(keywords, weights)

Select a keyword based on weights.

Parameters:
- keywords: List of keywords.
- weights: Corresponding list of weights.

Returns:
- Selected keyword, or None if no selection is possible.

Source code in backend\processing.py
def weighted_random_selection(keywords, weights):
    """
    Select a keyword based on weights.

    Parameters:
    - keywords: List of keywords.
    - weights: Corresponding list of weights.

    Returns:
    - Selected keyword or None if no selection possible.
    """
    total_weight = sum(weights)
    if total_weight == 0:
        logger.info("Total weight is zero. No keyword can be selected.")
        return None
    probabilities = [w / total_weight for w in weights]
    selected_keyword = np.random.choice(keywords, p=probabilities)

    try:
        selected_index = keywords.index(selected_keyword)
        weight = weights[selected_index]
        print(f"Selected keyword: '{selected_keyword}' with weight {weight}", flush=True)
    except ValueError:
        logger.error(f"Selected keyword '{selected_keyword}' not found in keywords list.")

    return selected_keyword
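
Each keyword's selection probability is its weight divided by the total weight. For instance, with weights [2.0, 1.0, 1.0] the total is 4.0, giving probabilities [0.5, 0.25, 0.25]; a sketch with hypothetical keywords:

selected = weighted_random_selection(['graph', 'network', 'embedding'], [2.0, 1.0, 1.0])
# 'graph' is drawn with probability 0.5, the other two with 0.25 each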