# Recovered from commit 861ab98af2f993f8c350f20c612f7dc892286f4f
# ("Add web scraper implementation using Selenium", Louis Mylle,
# Wed Sep 10 21:29:59 2025 +0200), which adds this module as main.py together
# with a requirements.txt whose only entry is `selenium`.
"""Scrape comic download links from eboek.info with a Selenium-driven Chrome.

Run as a script: it logs in, asks for a page range, and opens every download
link found on each comic page so the browser downloads the files.

Credentials are read from the EBOEK_USERNAME / EBOEK_PASSWORD environment
variables, falling back to an interactive prompt.
"""

import getpass
import os
import random
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  # noqa: F401  (kept from the original import block)

LOGIN_URL = "https://eboek.info/komerin"
LISTING_URL = "https://eboek.info/stripverhalen-alle"
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)


class Scraper:
    """Chrome session that logs in to the site and triggers comic downloads.

    Can be used as a context manager (``with Scraper() as s:``) so the browser
    is quit even when scraping raises.
    """

    def __init__(self, headless=False):
        """Start Chrome.

        Args:
            headless: when true, pass ``--headless`` to Chrome.
        """
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")

        # Make it look more human
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option("useAutomationExtension", False)
        chrome_options.add_argument(f"user-agent={USER_AGENT}")

        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.execute_script(
            "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        )

    def __enter__(self):
        """Return the scraper itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb):
        """Quit the browser on leaving the ``with`` block; never swallow errors."""
        self.close()
        return False

    def human_delay(self, min_sec=0.5, max_sec=2):
        """Sleep for a random duration between min_sec and max_sec seconds."""
        time.sleep(random.uniform(min_sec, max_sec))

    def human_type(self, element, text):
        """Send text to element one character at a time with short random pauses."""
        for char in text:
            element.send_keys(char)
            time.sleep(random.uniform(0.05, 0.15))

    def navigate(self, url):
        """Load url in the current tab, then pause 1-3 seconds."""
        self.driver.get(url)
        self.human_delay(1, 3)

    def login(self, username, password):
        """Open the login page, type the credentials and submit the form.

        The fields are located with generic selectors (first text input, first
        password input, first submit input on the page); a missing element
        raises whatever ``find_element`` raises.
        """
        self.driver.get(LOGIN_URL)
        self.human_delay(2, 4)

        # Direct selectors based on what worked
        username_field = self.driver.find_element(By.CSS_SELECTOR, "input[type='text']")
        self.human_type(username_field, username)

        self.human_delay(0.5, 1)

        password_field = self.driver.find_element(By.CSS_SELECTOR, "input[type='password']")
        self.human_type(password_field, password)

        self.human_delay(0.5, 1.5)

        submit_button = self.driver.find_element(By.CSS_SELECTOR, "input[type='submit']")
        submit_button.click()

        self.human_delay(2, 4)

    def trigger_download(self, url):
        """Open url in a new tab (same session) to trigger a browser download.

        Links without a URL are skipped. The only wait is a fixed 3-5 second
        pause; nothing here verifies that the download finished or that the
        new tab closed itself.
        """
        if not url:
            print("Skipping download link without a URL")
            return

        # Store current window handle
        current_window = self.driver.current_window_handle

        # Pass the URL as a script argument instead of splicing it into the
        # JavaScript source, so quotes or backslashes in it cannot break (or
        # inject code into) the script.
        self.driver.execute_script("window.open(arguments[0], '_blank');", url)

        self.human_delay(3, 5)

        # Switch back to original window
        self.driver.switch_to.window(current_window)

        print(f"Download triggered for: {url}")

    def scrape(self, start_page=1, end_page=1):
        """Scrape comics from the inclusive page range start_page..end_page.

        For each listing page, every linked comic is opened in a temporary tab
        and all of its download links are triggered. An end_page below
        start_page results in no work.
        """
        for page_num in range(start_page, end_page + 1):
            page_url = self._page_url(page_num)

            print(f"\n{'='*50}")
            print(f"Processing page {page_num}: {page_url}")
            print(f"{'='*50}")

            self.navigate(page_url)

            # Scroll down a bit like a human would to see content
            self.driver.execute_script("window.scrollTo(0, 300)")
            self.human_delay(1, 2)

            comic_links = self.driver.find_elements(By.CSS_SELECTOR, 'h2.post-title a')
            print(f"Found {len(comic_links)} comic strips on page {page_num}")

            # Store URLs first to avoid stale element issues; drop anchors
            # that have no href, which could not be opened anyway.
            hrefs = (link.get_attribute('href') for link in comic_links)
            comic_urls = [href for href in hrefs if href]

            if page_num > start_page:
                self._pause_between_pages()

            # Remember the listing tab explicitly rather than assuming it is
            # the first entry of window_handles.
            main_window = self.driver.current_window_handle

            for i, url in enumerate(comic_urls, 1):
                print(f"\nProcessing comic {i}/{len(comic_urls)} on page {page_num}: {url}")

                # Random chance to scroll on main page before clicking
                if random.random() < 0.4:
                    scroll_amount = random.randint(100, 500)
                    self.driver.execute_script(f"window.scrollBy(0, {scroll_amount})")
                    self.human_delay(0.5, 1.5)

                succeeded = self._process_comic(url, main_window)

                # Take a longer break every 5 comics
                if succeeded and i % 5 == 0 and i < len(comic_urls):
                    break_time = random.uniform(3, 7)
                    print(f"\nTaking a break for {break_time:.1f} seconds...")
                    time.sleep(break_time)

                # Vary the delay between comics
                self.human_delay(1, 3)

    @staticmethod
    def _page_url(page_num):
        """Return the listing URL for page_num (page 1 is the bare listing URL)."""
        if page_num == 1:
            return LISTING_URL
        return f"{LISTING_URL}/page/{page_num}/"

    def _pause_between_pages(self):
        """Sleep 15-45 s with 70% probability, otherwise 5-10 s."""
        if random.random() < 0.7:
            break_time = random.uniform(15, 45)
            print(f"\nTaking a break between pages for {break_time:.1f} seconds...")
            time.sleep(break_time)
        else:
            # Even if no long break, always pause a bit
            short_break = random.uniform(5, 10)
            print(f"\nQuick pause for {short_break:.1f} seconds...")
            time.sleep(short_break)

    def _process_comic(self, url, main_window):
        """Open one comic page in a temporary tab and trigger its downloads.

        Returns True when the page was processed without error, False
        otherwise. The temporary tab is always closed and focus returned to
        main_window.
        """
        handles_before = set(self.driver.window_handles)
        self.driver.execute_script("window.open('');")
        new_tabs = [h for h in self.driver.window_handles if h not in handles_before]
        if not new_tabs:
            # Without a new tab, closing "the current window" below would
            # close the listing page itself.
            print(f"Error processing {url}: could not open a new tab")
            return False
        self.driver.switch_to.window(new_tabs[-1])

        try:
            self.driver.get(url)
            self.human_delay(2, 4)

            # Sometimes scroll down to see the content
            if random.random() < 0.6:
                self.driver.execute_script("window.scrollTo(0, 400)")
                self.human_delay(0.5, 1.5)

            title = self.driver.find_element(By.CSS_SELECTOR, 'h1.entry-title').text
            print(f"Title: {title}")

            # Small delay before clicking download
            self.human_delay(0.8, 2)

            # Execute the page's downloadLinks() JavaScript function
            self.driver.execute_script("downloadLinks()")
            self.human_delay(1.5, 3)

            link_elements = self.driver.find_elements(By.CSS_SELECTOR, 'table a')
            print(f"Found {len(link_elements)} download links")

            # Read href/text up front so later tab activity cannot leave us
            # holding stale element references.
            downloads = [
                (element.get_attribute('href'), element.text.strip())
                for element in link_elements
            ]

            for j, (file_url, file_name) in enumerate(downloads):
                print(f"Triggering download: {file_name}")
                self.trigger_download(file_url)

                # Small random delay between downloads
                if j < len(downloads) - 1:
                    self.human_delay(0.5, 1.5)
            return True
        except Exception as e:
            # Best-effort per comic: report and carry on with the next one.
            print(f"Error processing {url}: {e}")
            # Human would pause after an error
            self.human_delay(2, 4)
            return False
        finally:
            # Close tab and switch back
            self.driver.close()
            self.driver.switch_to.window(main_window)

    def close(self):
        """Quit the browser and end the WebDriver session."""
        self.driver.quit()


def _ask_page_number(prompt, minimum=1):
    """Prompt until the user enters a whole number that is at least minimum."""
    while True:
        raw = input(prompt)
        try:
            value = int(raw)
        except ValueError:
            print(f"'{raw}' is not a whole number, try again.")
            continue
        if value >= minimum:
            return value
        print(f"Please enter a number of at least {minimum}.")


def main():
    """Log in, ask for a page range, scrape it, then wait before closing."""
    # Credentials come from the environment or a prompt so they are never
    # stored in the source.
    username = os.environ.get("EBOEK_USERNAME") or input("Username: ")
    password = os.environ.get("EBOEK_PASSWORD") or getpass.getpass("Password: ")

    with Scraper() as scraper:
        # Login first
        scraper.login(username, password)

        # Ask which page(s) to scrape
        start = _ask_page_number("Enter start page number (1 for first page): ")
        end = _ask_page_number(
            "Enter end page number (same as start for single page): ",
            minimum=start,
        )

        # Scrape the specified pages
        scraper.scrape(start_page=start, end_page=end)

        # Keep browser open
        input("\nDone! Press Enter to close the browser...")


if __name__ == "__main__":
    main()