# KRALL / app.py
import streamlit as st
from bs4 import BeautifulSoup
import re
from requests.sessions import Session
from langdetect import detect, LangDetectException
from deep_translator import GoogleTranslator
from requests.exceptions import RequestException
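# Run locally with: streamlit run app.py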
# --- CONSTANTS ---
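# Browser-like request headers; many sites serve stripped-down or blocked pages to clients without a User-Agent.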
HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5"
}
# --- HELPER FUNCTIONS ---
def get_session():
"""Creates a session with standard headers."""
session = Session()
session.headers.update(HEADERS)
return session
def perform_login(session, login_url, email, password):
"""
Attempts a login.
ENHANCEMENT: Fetches the page first to find hidden CSRF tokens.
"""
try:
# 1. Get the login page first to set cookies/CSRF tokens
page = session.get(login_url, timeout=10)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
payload = {
            'email': email,  # Note: some sites use 'username' instead of 'email'; this key is specific to the target site
'password': password
}
# 2. Find hidden inputs (often required for CSRF protection)
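        # e.g. <input type="hidden" name="csrf_token" value="abc123"> would be forwarded as payload['csrf_token'] = 'abc123'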
form = soup.find('form')
if form:
for input_tag in form.find_all('input', type='hidden'):
name = input_tag.get('name')
value = input_tag.get('value')
if name and value:
payload[name] = value
# 3. Post credentials
response = session.post(login_url, data=payload, timeout=10)
response.raise_for_status()
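        # Note: a 2xx response only means the POST itself succeeded; it does not verify that the credentials were accepted.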
return True, "Login request sent."
except Exception as e:
return False, str(e)
def chunk_text(text, max_chars=4500):
"""Splits text into chunks to respect translation API limits."""
return [text[i:i+max_chars] for i in range(0, len(text), max_chars)]
def translate_content(text, target_lang='en'):
"""
Translates text using Deep Translator (more stable).
Optimized to check language first, then translate in chunks.
"""
if not text or len(text) < 5:
return text
try:
# Check language first to avoid unnecessary API calls
detected_lang = detect(text[:1000]) # Detect based on first 1000 chars
if detected_lang == target_lang:
return text
except LangDetectException:
pass # If detection fails, attempt translation anyway
translator = GoogleTranslator(source='auto', target=target_lang)
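    # deep-translator's GoogleTranslator rejects requests over ~5000 characters, hence the 4500-character chunks.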
chunks = chunk_text(text)
translated_chunks = []
progress_bar = st.progress(0)
for i, chunk in enumerate(chunks):
try:
translated = translator.translate(chunk)
translated_chunks.append(translated)
except Exception:
# Fallback: append original if translation fails
translated_chunks.append(chunk)
# Update progress
progress_bar.progress((i + 1) / len(chunks))
progress_bar.empty() # Remove bar when done
return " ".join(translated_chunks)
def scrape_url(url, query_selector=None, auth_details=None):
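    """Fetches a URL (optionally after logging in), strips non-content tags, and returns (clean_text, error)."""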
session = get_session()
# --- Authentication Phase ---
if auth_details and auth_details.get('login_url'):
st.info("πŸ”„ Attempting authentication...")
success, msg = perform_login(
session,
auth_details['login_url'],
auth_details['email'],
auth_details['password']
)
if not success:
st.error(f"Authentication failed (proceeding as guest): {msg}")
else:
st.success("Authentication request sent (Session updated).")
# --- Fetching Phase ---
try:
response = session.get(url, timeout=15)
response.raise_for_status()
except RequestException as e:
return None, f"Network Error: {e}"
# --- Parsing Phase ---
soup = BeautifulSoup(response.content, 'html.parser')
# Remove clutter
for tag in soup(["script", "style", "meta", "link", "noscript", "header", "footer", "nav", "aside", "svg", "iframe", "ad"]):
        tag.decompose()  # decompose() destroys the tag in place rather than returning it like extract()
# Extraction
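    # soup.select() accepts standard CSS selectors, e.g. "div.article-content", "#main p" or "article".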
if query_selector:
elements = soup.select(query_selector)
if not elements:
return None, "Query selector found no elements."
# Get text with separator to prevent words merging
text_content = "\n".join([el.get_text(separator=' ', strip=True) for el in elements])
else:
# Get all visible text from body, using separator
if soup.body:
text_content = soup.body.get_text(separator=' ', strip=True)
else:
text_content = soup.get_text(separator=' ', strip=True)
# Clean up whitespace
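    # e.g. "Breaking\n\n  News \t Today" collapses to "Breaking News Today"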
clean_text = re.sub(r'\s+', ' ', text_content).strip()
return clean_text, None
# --- MAIN APP ---
def main():
st.set_page_config(page_title="Universal Web Scraper", page_icon="πŸ•·οΈ", layout="wide")
st.title("πŸ•·οΈ Universal Web Scraper & Translator")
st.markdown("---")
# --- Sidebar Inputs ---
with st.sidebar:
st.header("βš™οΈ Configuration")
url_input = st.text_input("Target URL", placeholder="https://example.com")
query_selector = st.text_input("CSS Selector (Optional)", placeholder="e.g., div.article-content")
enable_translation = st.checkbox("Enable Auto-Translation", value=True)
with st.expander("πŸ” Authentication (Advanced)"):
st.caption("Only use if the content is behind a login.")
login_url = st.text_input("Login Page URL")
email = st.text_input("Email/Username")
password = st.text_input("Password", type="password")
# --- Main Logic ---
if st.button("πŸš€ Start Scraping", type="primary"):
if not url_input:
st.warning("Please enter a URL to proceed.")
return
auth_details = None
if login_url and email and password:
auth_details = {'login_url': login_url, 'email': email, 'password': password}
with st.spinner("Fetching and processing data..."):
# 1. Scrape
scraped_text, error = scrape_url(url_input, query_selector, auth_details)
if error:
st.error(error)
elif not scraped_text:
st.warning("No text content found at this URL. The site might be JavaScript-rendered (SPA).")
else:
# 2. Translate (if enabled)
final_text = scraped_text
if enable_translation:
with st.status("Detecting language and translating...", expanded=True) as status:
final_text = translate_content(scraped_text)
status.update(label="Processing Complete!", state="complete", expanded=False)
# 3. Display Results
st.success(f"Successfully extracted {len(final_text)} characters.")
tab1, tab2 = st.tabs(["πŸ“„ Cleaned Text", "πŸ” Raw Preview"])
with tab1:
st.text_area("Content", final_text, height=400)
# Download Button
st.download_button(
label="πŸ“₯ Download Text",
data=final_text,
file_name="scraped_data.txt",
mime="text/plain"
)
with tab2:
st.json({
"source": url_input,
"length": len(final_text),
"selector_used": query_selector if query_selector else "Body (Default)",
"snippet": final_text[:500] + "..."
})
if __name__ == "__main__":
main()