diff --git a/application/parser/remote/web_loader.py b/application/parser/remote/web_loader.py index a19e0c90a..cc1cdcb85 100644 --- a/application/parser/remote/web_loader.py +++ b/application/parser/remote/web_loader.py @@ -1,5 +1,7 @@ from application.parser.remote.base import BaseRemote +from application.parser.schema.base import Document from langchain_community.document_loaders import WebBaseLoader +from urllib.parse import urlparse headers = { "User-Agent": "Mozilla/5.0", @@ -23,10 +25,20 @@ def load_data(self, inputs): urls = [urls] documents = [] for url in urls: + # Check if the URL scheme is provided, if not, assume http + if not urlparse(url).scheme: + url = "http://" + url try: loader = self.loader([url], header_template=headers) - documents.extend(loader.load()) + loaded_docs = loader.load() + for doc in loaded_docs: + documents.append( + Document( + doc.page_content, + extra_info=doc.metadata, + ) + ) except Exception as e: print(f"Error processing URL {url}: {e}") continue - return documents + return documents \ No newline at end of file