DONE!
This commit is contained in:
968
sysahelper/FINAL2023/downloaded_pages/page_363_embedded.html
Normal file
968
sysahelper/FINAL2023/downloaded_pages/page_363_embedded.html
Normal file
File diff suppressed because one or more lines are too long
1190
sysahelper/FINAL2023/downloaded_pages/page_364_embedded.html
Normal file
1190
sysahelper/FINAL2023/downloaded_pages/page_364_embedded.html
Normal file
File diff suppressed because one or more lines are too long
710
sysahelper/FINAL2023/downloaded_pages/page_365_embedded.html
Normal file
710
sysahelper/FINAL2023/downloaded_pages/page_365_embedded.html
Normal file
File diff suppressed because one or more lines are too long
837
sysahelper/FINAL2023/downloaded_pages/page_366_embedded.html
Normal file
837
sysahelper/FINAL2023/downloaded_pages/page_366_embedded.html
Normal file
File diff suppressed because one or more lines are too long
714
sysahelper/FINAL2023/downloaded_pages/page_367_embedded.html
Normal file
714
sysahelper/FINAL2023/downloaded_pages/page_367_embedded.html
Normal file
File diff suppressed because one or more lines are too long
891
sysahelper/FINAL2023/downloaded_pages/page_368_embedded.html
Normal file
891
sysahelper/FINAL2023/downloaded_pages/page_368_embedded.html
Normal file
File diff suppressed because one or more lines are too long
812
sysahelper/FINAL2023/downloaded_pages/page_369_embedded.html
Normal file
812
sysahelper/FINAL2023/downloaded_pages/page_369_embedded.html
Normal file
File diff suppressed because one or more lines are too long
710
sysahelper/FINAL2023/downloaded_pages/page_370_embedded.html
Normal file
710
sysahelper/FINAL2023/downloaded_pages/page_370_embedded.html
Normal file
File diff suppressed because one or more lines are too long
761
sysahelper/FINAL2023/downloaded_pages/page_371_embedded.html
Normal file
761
sysahelper/FINAL2023/downloaded_pages/page_371_embedded.html
Normal file
File diff suppressed because one or more lines are too long
824
sysahelper/FINAL2023/downloaded_pages/page_372_embedded.html
Normal file
824
sysahelper/FINAL2023/downloaded_pages/page_372_embedded.html
Normal file
File diff suppressed because one or more lines are too long
786
sysahelper/FINAL2023/downloaded_pages/page_373_embedded.html
Normal file
786
sysahelper/FINAL2023/downloaded_pages/page_373_embedded.html
Normal file
File diff suppressed because one or more lines are too long
819
sysahelper/FINAL2023/downloaded_pages/page_374_embedded.html
Normal file
819
sysahelper/FINAL2023/downloaded_pages/page_374_embedded.html
Normal file
File diff suppressed because one or more lines are too long
733
sysahelper/FINAL2023/downloaded_pages/page_375_embedded.html
Normal file
733
sysahelper/FINAL2023/downloaded_pages/page_375_embedded.html
Normal file
File diff suppressed because one or more lines are too long
688
sysahelper/FINAL2023/downloaded_pages/page_376_embedded.html
Normal file
688
sysahelper/FINAL2023/downloaded_pages/page_376_embedded.html
Normal file
File diff suppressed because one or more lines are too long
698
sysahelper/FINAL2023/downloaded_pages/page_377_embedded.html
Normal file
698
sysahelper/FINAL2023/downloaded_pages/page_377_embedded.html
Normal file
File diff suppressed because one or more lines are too long
692
sysahelper/FINAL2023/downloaded_pages/page_378_embedded.html
Normal file
692
sysahelper/FINAL2023/downloaded_pages/page_378_embedded.html
Normal file
File diff suppressed because one or more lines are too long
792
sysahelper/FINAL2023/downloaded_pages/page_379_embedded.html
Normal file
792
sysahelper/FINAL2023/downloaded_pages/page_379_embedded.html
Normal file
File diff suppressed because one or more lines are too long
669
sysahelper/FINAL2023/downloaded_pages/page_380_embedded.html
Normal file
669
sysahelper/FINAL2023/downloaded_pages/page_380_embedded.html
Normal file
File diff suppressed because one or more lines are too long
634
sysahelper/FINAL2023/downloaded_pages/page_381_embedded.html
Normal file
634
sysahelper/FINAL2023/downloaded_pages/page_381_embedded.html
Normal file
File diff suppressed because one or more lines are too long
753
sysahelper/FINAL2023/downloaded_pages/page_382_embedded.html
Normal file
753
sysahelper/FINAL2023/downloaded_pages/page_382_embedded.html
Normal file
File diff suppressed because one or more lines are too long
1000
sysahelper/FINAL2023/downloaded_pages/page_383_embedded.html
Normal file
1000
sysahelper/FINAL2023/downloaded_pages/page_383_embedded.html
Normal file
File diff suppressed because one or more lines are too long
190
sysahelper/FINAL2023/main.py
Normal file
190
sysahelper/FINAL2023/main.py
Normal file
@@ -0,0 +1,190 @@
|
||||
import requests
|
||||
import os
|
||||
import base64
|
||||
from typing import List
|
||||
from urllib.parse import urljoin
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def embed_images_in_html(session: requests.Session, html_content: str, base_url: str) -> str:
    """
    Download every <img> referenced by the HTML and inline it as a base64 data URI.

    Args:
        session: requests session (carrying the cookies) used to fetch images.
        html_content: Source HTML of the page.
        base_url: URL of the page, used to resolve relative image paths.

    Returns:
        The HTML re-serialized from the parsed tree. Every image that was
        downloaded successfully has its ``src`` replaced by a ``data:`` URI;
        images that could not be fetched keep their original ``src``.
    """
    # Fallback MIME types keyed by file extension; consulted only when the
    # server does not send a Content-Type header.
    mime_by_extension = {
        '.png': 'image/png',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.gif': 'image/gif',
        '.webp': 'image/webp',
        '.svg': 'image/svg+xml',
    }

    soup = BeautifulSoup(html_content, 'html.parser')
    img_tags = soup.find_all('img')
    total = len(img_tags)

    print(f" Найдено изображений: {total}")

    embedded_count = 0
    failed_count = 0

    for idx, img_tag in enumerate(img_tags, 1):
        img_src = img_tag.get('src')
        if not img_src:
            # An <img> without a src cannot be embedded; counted as a failure.
            failed_count += 1
            continue

        # Already-inlined images and non-fetchable pseudo-links are left as is.
        # Note: images hosted on other domains are NOT skipped, they are
        # downloaded like any other.
        if img_src.startswith(('data:', '#', 'javascript:')):
            continue

        # Resolve the (possibly relative) src against the page URL.
        img_url = urljoin(base_url, img_src)

        print(f" [{idx}/{total}] Встраиваю: {img_url[:70]}...")

        try:
            response = session.get(img_url, timeout=15)
            response.raise_for_status()

            # Keep only the media type: parameters such as "; charset=utf-8"
            # do not belong in front of ";base64," in the data URI.
            content_type = (
                response.headers.get('content-type', '')
                .split(';', 1)[0]
                .strip()
                .lower()
            )

            if content_type.startswith('text/'):
                # A textual body (e.g. an HTML error page served with status
                # 200) is not an image; embedding it would replace a working
                # link with a broken one.
                raise ValueError(f"unexpected content type {content_type!r}")

            if not content_type:
                # Guess from the extension, ignoring any query string or
                # fragment so that "pic.jpg?rev=1" is still seen as JPEG.
                url_path = img_url.lower().split('#', 1)[0].split('?', 1)[0]
                content_type = next(
                    (
                        mime
                        for extension, mime in mime_by_extension.items()
                        if url_path.endswith(extension)
                    ),
                    'image/png',
                )

            # Build the data URI and put it in place of the original link.
            img_data = base64.b64encode(response.content).decode('utf-8')
            img_tag['src'] = f"data:{content_type};base64,{img_data}"
            embedded_count += 1
            print(f" ✓ Встроено ({len(response.content) // 1024} КБ)")

        except Exception as e:
            # Best effort: a failed image must not abort the whole page, so
            # the error is reported and the original src is kept.
            failed_count += 1
            print(f" ✗ Ошибка: {type(e).__name__}")

    print(f" Итого: {embedded_count} встроено, {failed_count} ошибок")

    return str(soup)
|
||||
|
||||
def download_web_pages(page_ids: List[str], session_cookie_value: str) -> None:
    """
    Download pages and save each one as a single HTML file with inlined images.

    For every ID the page ``https://sysahelper.ru/mod/page/view.php?id=<ID>``
    is fetched, its images are embedded via ``embed_images_in_html`` and the
    result is written to ``downloaded_pages/page_<ID>_embedded.html``
    (relative to the current working directory). A page that fails to
    download or process is reported and skipped; it does not stop the run.

    Args:
        page_ids: IDs of the pages to download.
        session_cookie_value: Value of the ``MoodleSession`` cookie.
    """
    base_url_template = "https://sysahelper.ru/mod/page/view.php?id="
    output_folder = "downloaded_pages"
    total = len(page_ids)

    # The context manager closes the session's pooled connections on exit,
    # including when an unexpected error escapes.
    with requests.Session() as session:
        # Authenticate by presenting the session cookie for the site's domain.
        session.cookies.set(
            "MoodleSession",
            session_cookie_value,
            domain="sysahelper.ru",
            path="/"
        )

        # Browser-like headers.
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Referer': 'https://sysahelper.ru/',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
            'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
            'Connection': 'keep-alive'
        })

        os.makedirs(output_folder, exist_ok=True)

        print(f"Начинаю скачивание {total} страниц с встраиванием изображений...\n")

        for idx, page_id in enumerate(page_ids, 1):
            page_url = f"{base_url_template}{page_id}"
            print(f"[{idx}/{total}] Скачиваю: {page_url}")

            try:
                response = session.get(page_url, timeout=15)
                print(f" Статус: {response.status_code}")

                if response.status_code != 200:
                    print(f" ✗ Ошибка HTTP {response.status_code}\n")
                    continue

                # Heuristic: a page that mentions "logout" anywhere is taken
                # to have been served to a logged-in user.
                if 'logout' in response.text.lower():
                    print(" ✓ Авторизован")
                else:
                    print(" ⚠ Возможно, нет доступа к материалам")

                # Inline the images, then store everything in one file.
                modified_html = embed_images_in_html(session, response.text, page_url)

                filename = os.path.join(output_folder, f"page_{page_id}_embedded.html")
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(modified_html)

                # Report the size of the written file in KiB.
                file_size = os.path.getsize(filename) / 1024
                print(f" ✓ Сохранено: {filename} ({file_size:.1f} КБ)\n")

            except Exception as e:
                # Best effort: report the failure and move on to the next page.
                print(f" ✗ Ошибка: {type(e).__name__}: {e}\n")

    print("="*60)
    print("Готово! Все страницы сохранены с встроенными изображениями.")
    print(f"Откройте файлы из папки '{output_folder}' в браузере.")
    print("="*60)
|
||||
|
||||
if __name__ == "__main__":
    # IDs of the pages to download: 361 through 383 inclusive.
    page_ids = [str(page_id) for page_id in range(361, 384)]

    # The MoodleSession cookie is a credential, so it is taken from the
    # MOODLE_SESSION environment variable instead of being stored in the
    # source file (and therefore in version control).
    SESSION_COOKIE_VALUE = os.environ.get("MOODLE_SESSION", "").strip()
    if not SESSION_COOKIE_VALUE:
        raise SystemExit(
            "Задайте переменную окружения MOODLE_SESSION со значением куки MoodleSession."
        )

    # Download the pages with their images embedded.
    download_web_pages(page_ids, SESSION_COOKIE_VALUE)
|
||||
Reference in New Issue
Block a user