
It also happens in the 3GM subforum.

It started in Archivo and Coronabicho, and then it spread to other threads.

Why it's happening and what for, we don't know. The most naive think it's some technical problem, others are betting that the forum will go paid very soon, and others say there's something very shady behind it...
 
Upload the threads you've saved somewhere, compressed.

They take up a lot of space and I have them scattered across folders by topic, so I'll pass for now.
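
If I ever get around to it, something like this should zip each topic folder in one go (a rough sketch using only the standard library; the folder name is made up):

Python:
#!/usr/bin/env python
# Compresses each topic subfolder into its own .zip (standard library only)
# 'hilos_guardados' is a placeholder; point it at the actual base folder
import shutil
from pathlib import Path

base = Path('hilos_guardados')
for carpeta in sorted(p for p in base.iterdir() if p.is_dir()):
    # Creates e.g. hilos_guardados/economia.zip next to each folder
    shutil.make_archive(str(carpeta), 'zip', root_dir=str(carpeta))
    print(f'compressed: {carpeta}.zip')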

Since I like you, here are a couple of Python scripts to prune them and make them lighter (I had help from chatgpt).

The first two take as an argument an HTML file generated with the extension I linked earlier. Usage example:

Bash:
python script.py <html_file>

Python:
#!/usr/bin/env python
"""Elimina firmas, encabezados y otros elementos para hacer menos pesado el html de Burbuja.info"""

import sys
from bs4 import BeautifulSoup
from datetime import datetime


if len(sys.argv) != 2:
    print("Usage: python script.py <html_file>")
    sys.exit(1)

html_file_path = sys.argv[1]
html_file_output = html_file_path + 'comp.html'

# Read the HTML content from the file
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()


# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Find and remove signatures, action bars, navigation, footer and other page chrome
classes_to_remove = [
    'message-signature',
    'message-actionBar',
    'p-sectionLinks',
    'p-nav-inner',
    'p-footer',
    'message message--quickReply block-topRadiusContent block-bottomRadiusContent',
    'message-attribution-opposite message-attribution-opposite--list',
]

for class_name in classes_to_remove:
    for element in soup.find_all(class_=class_name):
        element.decompose()


try:
    # Find all time elements whose text mentions "ayer" (relative timestamps)
    time_elements = soup.find_all('time', class_='u-dt',
                                  string=lambda t: t and 'ayer' in t.lower())
 
    # Replace each time element with the current date and time
    for time_element in time_elements:
        # Extract the date and time from the element attributes
        date_string = time_element['data-date-string']
        time_string = time_element['data-time-string']
 
        # Combine the date and time strings and parse into a datetime object
        # (note: %b is locale-dependent; Spanish month abbreviations need an es_* locale)
        datetime_str = f'{date_string} {time_string}'
        parsed_datetime = datetime.strptime(datetime_str, '%d %b %Y %I:%M %p')
 
        # Get the current date and time
        current_datetime = datetime.now()
 
        # Replace the element attributes with the current date and time
        time_element['data-date-string'] = current_datetime.strftime('%d %b %Y')
        time_element['data-time-string'] = current_datetime.strftime('%I:%M %p')
        time_element['datetime'] = current_datetime.strftime('%Y-%m-%dT%H:%M:%S%z')
        time_element['title'] = current_datetime.strftime('%d %b %Y a la(s) %I:%M %p')
 
        # Update the element text
        updated_time_str = parsed_datetime.strftime('%d %b %Y a la(s) %I:%M %p')
        time_element.string = updated_time_str
except Exception as exc:
    # Date parsing can fail (missing attributes, locale issues); keep going anyway
    print(f"time error: {exc}")

# Write the modified HTML back to the file
with open(html_file_output, 'w', encoding='utf-8') as file:
    file.write(soup.prettify())
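
Note that the script doesn't touch the original file: the pruned copy is written next to it, with comp.html appended to the input name.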

Python:
#!/usr/bin/env python
# Extracts plain text only (check how quotes of other users come out)
import re
import requests
from bs4 import BeautifulSoup

def extract_messages(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    messages = []

    # Find all message content elements
    message_contents = soup.find_all('div', class_='message-content js-messageContent')

    # Extract the text content of each message and add to the list
    for message_content in message_contents:
        message_text = message_content.get_text(strip=True)
        cleaned_message = remove_expandable_content(message_text)
        messages.append(cleaned_message)

    return messages

def extract_users(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    users = []

    # Find all message user details elements
    user_details = soup.find_all('div', class_='message-userDetails')

    # Extract the text content of each user detail and add to the list
    for user_detail in user_details:
        user_text = user_detail.get_text(strip=True)
        cleaned_user = remove_expandable_content(user_text)
        users.append(cleaned_user)

    return users

def remove_expandable_content(text):
    # Match quoted blocks that start with 'dijo:' and end with 'clic para expandir...'
    pattern = re.compile(r'dijo:.*?clic para expandir\.\.\.', re.DOTALL)

    # Remove the matched content from text
    cleaned_text = re.sub(pattern, ' fue citado ]\n ', text)

    return cleaned_text

if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        print("Usage: python script.py <html_file_or_url>")
        sys.exit(1)

    input_path = sys.argv[1]

    if input_path.startswith('http://') or input_path.startswith('https://'):
        # Fetch HTML content from the URL
        response = requests.get(input_path)
        html_content = response.text
    else:
        # Read HTML content from the local file
        with open(input_path, 'r', encoding='utf-8') as file:
            html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    # Find and remove all <h5> user-title elements with the specified class
    user_titles = soup.find_all('h5', class_='userTitle message-userTitle')
    for user_title in user_titles:
        user_title.decompose()

    # Extract and process messages from the modified HTML content
    users = extract_users(str(soup))
    messages = extract_messages(str(soup))

    # Print the extracted and processed messages
    for i in range(0, len(messages)):
        print(f"{i+1}:({users[i]}): {messages[i]}\n")
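
If you'd rather reuse these functions from another script instead of the command line, something like this should work (the module and file names are just examples):

Python:
# Rough sketch: importing the extractor as a module.
# Assumes the script above was saved as extraer.py and a thread as hilo.html (both hypothetical names).
# Unlike the CLI above, this does not strip the <h5> user-title elements first.
from extraer import extract_messages, extract_users

with open('hilo.html', 'r', encoding='utf-8') as f:
    html = f.read()

for user, message in zip(extract_users(html), extract_messages(html)):
    print(f"{user}: {message[:80]}")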


Python:
#!/usr/bin/env python
# Extracts links to burbuja.info threads whose titles contain numbers, from the Economía subforum
# Output: CSV

import requests
import re
from bs4 import BeautifulSoup

# Request the first pages of the subforum and collect [title, url] pairs
enlaces = []
titulos = []
for i in range(1, 10):
    website = "https://www.burbuja.info/inmobiliaria/forums/economia/page-" + str(i)
    page = requests.get(website)
 
    patterns=r'\d+'
 
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(page.content, "html.parser")
 
    # Find all the links on the page
    links = soup.find_all("a")
 
 
    # Extract the href from each link and keep thread links whose title contains a number
    for link in links:
        url = 'https://www.burbuja.info' + str(link.get("href"))
        title = str(link.string)

        # Discard link texts that are just timestamps ("14 Dic 2023", "hace 5 minutos", etc.)
        timer = bool(re.search(r'\d{1,2}\s\w{3}\s\d{4}|hace \d+ minutos| a la\(s\) |Hace un momento|Hace 1 minuto', title))
        patron = bool(re.search(patterns, title))
        if 'threads' in url and not timer and patron and len(title) > 4:
            enlaces.append([title, url])
            titulos.append([title])

# Write the results as a semicolon-separated CSV (output path is hardcoded for an nginx static dir)
import pandas
pandas.DataFrame(enlaces).to_csv('/usr/share/nginx/html/static/bbja.csv', sep=';')
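
A quick way to sanity-check the resulting file (pandas is already a dependency of the script; the path is the same hardcoded one as above):

Python:
# Read the CSV back and show the first extracted [title, url] pairs
import pandas
df = pandas.read_csv('/usr/share/nginx/html/static/bbja.csv', sep=';', index_col=0)
print(df.head())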
 
It's possible so, but maybe not. The reality of realities is that they really are real when they actually become reality.

How to speak in the language of the forum in which you speak.

Note: Hick

Translation: Hick, country bumpkin.
 
 
If you want premium for free it's one month; if you want a year for free, send me a PM.
 