optimized sitemap search

This commit is contained in:
Faisal Shahzad 2023-12-22 18:09:32 +01:00
parent cbd21ac068
commit bc121722a7
4 changed files with 38 additions and 28 deletions

View file

@ -41,7 +41,7 @@ from bs4 import BeautifulSoup
# +++++++++++++++++++++++++++++++++++++++++++++++++++++

from ..core.constants import CONFIGS
from ..core.utils import get_clean_url, get_remote_content
from ..core.utils import get_clean_url, get_remote_content, is_url_valid

# +++++++++++++++++++++++++++++++++++++++++++++++++++++
# IMPLEMENTATIONS
@ -57,27 +57,29 @@ def find_sitemap_location(home_url_: str) -> str:
Returns:
str: Location of Sitemap
"""
for sitemap_path in CONFIGS["SITEMAP"]["SEARCH_PATHS"]:
sitemap_url = get_clean_url(home_url_, sitemap_path)
response = get_remote_content(sitemap_url)
if response.status_code < 400:
return parse.urlparse(response.url).path
if is_url_valid(home_url_):
for sitemap_path in CONFIGS["SITEMAP"]["SEARCH_PATHS"]:
sitemap_url = get_clean_url(home_url_, sitemap_path)
response = get_remote_content(sitemap_url)
if response.status_code < 400:
return parse.urlparse(response.url).path

# robots.txt
robots_txt = get_clean_url(home_url_, "robots.txt")
response = get_remote_content(robots_txt)
if response:
for item in response.text.split("\n"):
if item.startswith("Sitemap:"):
return item.split("Sitemap:")[-1].strip()
# robots.txt
robots_txt = get_clean_url(home_url_, "robots.txt")
response = get_remote_content(robots_txt)
if response:
for item in response.text.split("\n"):
if item.startswith("Sitemap:"):
return item.split("Sitemap:")[-1].strip()

# check home page for link rel=sitemap
response = get_remote_content(home_url_)
if response:
soup = BeautifulSoup(response.text, features="xml")
for link in soup.find_all("link"):
if link.has_attr("sitemap"):
return link["href"]
# check home page for link rel=sitemap
response = get_remote_content(home_url_)
if response:
soup = BeautifulSoup(response.text, features="xml")
for link in soup.find_all("link"):
if link.has_attr("sitemap"):
return link["href"]
return ""
return ""



View file

@ -228,6 +228,12 @@ def is_url_valid(url_: str) -> bool:
url_parsed_ = parse.urlparse(url_)

if all([url_parsed_.scheme, url_parsed_.netloc]):
return get_remote_content(url_parsed_, max_retires=1).status_code < 399
from urllib.request import urlopen

# print(url_parsed_)
# # return get_remote_content(url_parsed_, max_retires=1).status_code < 399
try:
return urlopen(url_).getcode() < 399
except:
return False
return False

View file

@ -378,7 +378,7 @@ class StaticWordPressGUI(QMainWindow):
self._project.open(project_path)
if self._project.is_open():
project_dialog = ProjectDialog(
self, self._project, title_="Project Properties"
parent=self, project_=self._project, title_="Project Properties"
)

if project_dialog.exec_():
@ -756,7 +756,8 @@ class StaticWordPressGUI(QMainWindow):
def update_widgets(self) -> None:
self.findChild(QMenu, "menu_github").setEnabled(self._project.has_github())
self.findChild(QMenu, "menu_wordpress").setEnabled(
self._project.has_wordpress() or self._project.can_crawl()
self._project.is_open()
and (self._project.has_wordpress() or self._project.can_crawl())
)
self.findChild(QToolBar, "toolbar_github").setEnabled(
self._project.has_github()

View file

@ -323,7 +323,7 @@ class ProjectDialog(QDialog):
self.pushbutton_verify.setIcon(
QIcon(f"{SHARE_FOLDER_PATH}/icons/check_project.svg")
)
self.pushbutton_verify.clicked.connect(self.check_project)
self.pushbutton_verify.clicked.connect(self.verify_project_settings)

self.pushbutton_save = QPushButton("&Save")
self.pushbutton_save.setIcon(QIcon(f"{SHARE_FOLDER_PATH}/icons/ok.svg"))
@ -398,10 +398,11 @@ class ProjectDialog(QDialog):
self._bg_worker.emit_sitemap_location.connect(self.update_sitemap_location)
self._bg_thread.start()

def update_sitemap_location(self, sitemap_location):
self.lineedit_sitemap.setText(sitemap_location)
def update_sitemap_location(self, sitemap_location_):
if sitemap_location_:
self.lineedit_sitemap.setText(sitemap_location_)

def check_project(self):
def verify_project_settings(self):
""""""
# TODO: Add checks for WP_API and Gh_API and if not present then disable them.
# TODO: Move these checks to background thread e.g. for WP_API or SRC_URL or SRC or DST Path
@ -455,7 +456,7 @@ class ProjectDialog(QDialog):
[
self.lineedit_project_name.text(),
self.lineedit_output.text(),
is_url_valid(self.lineedit_src_url.text()),
# is_url_valid(self.lineedit_src_url.text()),
Path(self.lineedit_output.text()).is_dir(),
]
):