Skip to content

How to scrape TEMU #689

@KalvinThien

Description

@KalvinThien

I'm trying to scrape the information from TEMU https://www.temu.com/.
Processing https://www.temu.com/vn-en/2--car--------universal--sun----pvc---accessories-----g-601099650626830.html ...
Extracted Data:
Title: No title found.

image

`import sys
import asyncio
import time
from PyQt6.QtWidgets import QApplication, QMainWindow, QVBoxLayout, QWidget, QPushButton, QTextEdit, QLineEdit, QLabel
from PyQt6.QtCore import QThread, pyqtSignal
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext

class CrawlerThread(QThread):
    """Run a one-URL Playwright crawl on a background Qt thread.

    Progress, extracted data, and a final statistics summary are reported
    through the three string signals declared below rather than returned.
    """

    # Progress and error messages produced while handling a request.
    log_signal = pyqtSignal(str)
    # The formatted extraction result ("Title: ...").
    data_signal = pyqtSignal(str)
    # Multi-line statistics summary, emitted once after the crawl finishes.
    runtime_signal = pyqtSignal(str)

    def __init__(self, url):
        """Store the URL to crawl and reset the per-run counters.

        Args:
            url: Address passed to the crawler as its only start URL.
        """
        super().__init__()
        self.url = url
        self.request_count = 0
        self.failed_requests = 0
        self.total_duration = 0

    async def run_crawler(self):
        """Crawl ``self.url``, emit the page title, then emit run statistics."""
        crawler = PlaywrightCrawler(max_requests_per_crawl=1)

        @crawler.router.default_handler
        async def request_handler(context: PlaywrightCrawlingContext):
            """Extract the title element's text from the loaded page."""
            self.log_signal.emit(f"Processing {context.request.url} ...")
            # perf_counter is monotonic, so the measured duration cannot be
            # skewed by system clock adjustments the way time.time() can.
            start_time = time.perf_counter()

            try:
                # Title lives in the element(s) with class '_2rn4tqXP'.
                # NOTE(review): this looks like a generated class name and the
                # element may be rendered after the initial load -- confirm the
                # selector against the live DOM if nothing is ever found.
                title_matches = context.page.locator('._2rn4tqXP')
                if await title_matches.count() > 0:
                    # Read from the first match only: calling inner_text() on
                    # a locator that resolves to several elements raises a
                    # strict-mode error.
                    title = await title_matches.first.inner_text()
                else:
                    title = "No title found."

                # Book-keeping for the statistics summary.
                request_duration = time.perf_counter() - start_time
                self.request_count += 1
                self.total_duration += request_duration

                self.data_signal.emit(f"Title: {title}\n")

            except Exception as e:
                # Boundary handler: report the failure to the UI instead of
                # letting it propagate out of the request handler.
                self.failed_requests += 1
                self.log_signal.emit(f"Error: {e}")

        await crawler.run([self.url])

        # Guard the average against a run in which no request succeeded.
        average_duration = (
            self.total_duration / self.request_count
            if self.request_count > 0
            else 0
        )
        runtime_stats = (
            f"Requests Finished: {self.request_count}\n"
            f"Requests Failed: {self.failed_requests}\n"
            f"Average Request Duration: {average_duration:.2f} seconds\n"
            f"Total Runtime: {self.total_duration:.2f} seconds"
        )
        self.runtime_signal.emit(runtime_stats)

    def run(self):
        """QThread entry point: drive the async crawl on a fresh event loop."""
        asyncio.run(self.run_crawler())

class MainWindow(QMainWindow):
    """Main window: a URL field, a start button, an output log and statistics."""

    def __init__(self):
        """Build the widgets, lay them out, and wire the start button."""
        # The constructor must be spelled __init__ (with the matching
        # super().__init__() call); a method named plain `init` is never
        # invoked on construction, so no widgets would be created.
        super().__init__()
        self.setWindowTitle("Web Data Crawler")
        self.setGeometry(100, 100, 800, 600)

        # Holds the active CrawlerThread so it is not dropped while running.
        self.crawler_thread = None

        # Widgets
        self.url_input = QLineEdit()
        self.url_input.setPlaceholderText("Enter URL here")
        self.start_button = QPushButton("Start Crawling")
        self.output_area = QTextEdit()
        self.output_area.setReadOnly(True)
        self.runtime_label = QLabel("Runtime Statistics:")

        # Layout
        layout = QVBoxLayout()
        layout.addWidget(self.url_input)
        layout.addWidget(self.start_button)
        layout.addWidget(self.output_area)
        layout.addWidget(self.runtime_label)
        container = QWidget()
        container.setLayout(layout)
        self.setCentralWidget(container)

        # Connections
        self.start_button.clicked.connect(self.start_crawling)

    def start_crawling(self):
        """Validate the entered URL and start a crawl on a worker thread."""
        url = self.url_input.text().strip()
        if not url:
            self.output_area.setText("Please enter a valid URL.")
            return

        # Rebinding self.crawler_thread while the previous thread is still
        # running would drop the only reference to a live QThread, so refuse
        # to start a second crawl until the first one has finished.
        if self.crawler_thread is not None and self.crawler_thread.isRunning():
            self.output_area.append(
                "A crawl is already running. Please wait for it to finish."
            )
            return

        self.output_area.clear()

        # Run the crawler in a separate thread so the UI stays responsive.
        self.crawler_thread = CrawlerThread(url)
        self.crawler_thread.log_signal.connect(self.update_output)
        self.crawler_thread.data_signal.connect(self.display_data)
        self.crawler_thread.runtime_signal.connect(self.display_runtime)
        self.crawler_thread.start()

    def update_output(self, text):
        """Append a log message to the output area."""
        self.output_area.append(text)

    def display_data(self, data):
        """Append the extracted data to the output area under a heading."""
        self.output_area.append("Extracted Data:\n" + data)

    def display_runtime(self, runtime):
        """Show the statistics summary in the runtime label."""
        self.runtime_label.setText("Runtime Statistics:\n" + runtime)

# Script entry: create the Qt application and the main window, show the
# window, then exit the interpreter with whatever app.exec() returns.
app = QApplication(sys.argv)
window = MainWindow()
window.show()
exit_status = app.exec()
sys.exit(exit_status)

`

I've tried, but every attempt returns no result. Can someone help me?

Metadata

Metadata

Assignees

No one assigned

    Labels

    t-tooling: Issues with this label are in the ownership of the tooling team.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions