Descargar publicaciones de Instagram usando el módulo Python Selenium

En este artículo aprenderemos a descargar las publicaciones de un perfil de Instagram utilizando el módulo Selenium de Python.

Requisitos:

  • Google Chrome o Firefox
  • Controlador Chrome (para Google Chrome) o controlador Gecko (para Mozilla Firefox)
  • Paquete Selenium: es una potente herramienta para controlar un navegador web desde un programa. Funciona con todos los navegadores y en los principales sistemas operativos, y sus scripts se pueden escribir en varios lenguajes de programación, por ejemplo Python, Java, C#, etc. Se puede instalar con el siguiente comando:
pip install selenium 
  • Paquete Beautiful Soup: es una biblioteca de Python para extraer datos de archivos HTML y XML. Funciona con el analizador (parser) que usted prefiera para proporcionar formas idiomáticas de navegar, buscar y modificar el árbol de análisis. Se puede instalar con el siguiente comando:
pip install bs4
  • Paquete requests: la biblioteca requests permite realizar solicitudes HTTP a una URL específica desde Python. Se puede instalar con el siguiente comando:
pip install requests

Enfoque paso a paso:

Paso 1: importar módulos e ingresar la información de inicio de sesión junto con la URL de la página. 

Python3

# import required modules
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.common.exceptions
import time
from bs4 import BeautifulSoup as bs
import requests
import os
 
# get instagram account credentials
# getpass (standard library) reads the password without echoing it to the
# terminal, unlike input(); it is imported here so that this snippet is
# self-contained.
from getpass import getpass

username = input('Enter Your User Name ')
password = getpass('Enter Your Password ')

# assign URL
# Surrounding whitespace and slashes are stripped from the profile name so
# that the last '/'-separated component of the URL is the name itself.
url = 'https://instagram.com/' + \
    input('Enter User Name Of User For Downloading Posts ').strip().strip('/')

Paso 2: Función para iniciar una nueva sesión del navegador. Según su instalación, es posible que deba indicar la ruta del controlador web en la llamada a Chrome().

Python3

# start the browser session
def path():
    """Launch a new Chrome session and store it in the global ``chrome``.

    ``webdriver.Chrome()`` is called without arguments; add the driver
    path to that call if your installation requires it.
    """
    global chrome
    chrome = webdriver.Chrome()

Paso 3: Función para ingresar la URL de la página. 

Python3

# open the requested page
def url_name(url):
    """Navigate the global ``chrome`` session to ``url``, then pause.

    Args:
        url: Address of the page to open.
    """
    chrome.get(url)

    # Unconditional 4-second pause (time.sleep, not a Selenium wait): it
    # gives the page time to render so that the element lookups that
    # follow do not fail with NoSuchElementException too early.
    time.sleep(4)

Paso 4: Función para ingresar su información de inicio de sesión. 

Python3

# login to access post
def login(username, your_password):
    """Log in to Instagram through the global ``chrome`` session.

    Clicks the log-in button on the page that is currently open, types
    the credentials into the form, submits it and then clicks the
    "Not Now" button.

    Args:
        username: Account name typed into the ``username`` field.
        your_password: Password typed into the ``password`` field.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If any of the
            buttons or form fields cannot be found.
    """
    # The find_element_by_* helpers were removed in Selenium 4.3;
    # find_element(By.<strategy>, value) works on old and new releases.
    from selenium.webdriver.common.by import By

    log_but = chrome.find_element(By.CLASS_NAME, "L3NKy")
    time.sleep(2)
    log_but.click()
    time.sleep(4)

    # fill in the username box
    usern = chrome.find_element(By.NAME, "username")
    usern.send_keys(username)

    # fill in the password box
    passw = chrome.find_element(By.NAME, "password")
    passw.send_keys(your_password)

    # pressing Enter inside the password box submits the form
    passw.send_keys(Keys.RETURN)

    time.sleep(5.5)

    # Find and click the "Not Now" button
    notn = chrome.find_element(By.CLASS_NAME, "yWX7d")
    notn.click()
    time.sleep(3)

Paso 5: Función para abrir la primera publicación.

Python3

# function to open the first post
def first_post():
    """Click the first post on the page open in the global ``chrome``.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If no element
            with class ``kIKUG`` is present.
    """
    # find_element_by_class_name was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    # click() returns None, so its result is not stored
    chrome.find_element(By.CLASS_NAME, "kIKUG").click()
    time.sleep(2)

Paso 6: Función para descargar todas las publicaciones.

Python3

# helper: save the media of the post that is currently open
def _save_open_post(user_name, post_number):
    """Download every image/video of the post currently open in ``chrome``.

    A post with one item is written to ``<user_name>/content<post_number>``;
    a nested post is written item by item to
    ``<user_name>/content<post_number>.<i>`` with ``i`` starting at 0.

    Args:
        user_name: Name of the existing directory to write into.
        post_number: 1-based position of the post, used in the file name.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If a nested
            post has no element with class ``rQDP3``.
    """
    # find_element_by_class_name was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    base_name = user_name + '/' + 'content' + str(post_number)

    # nested_check() returns the "next item" arrow, or 0 when there is none
    nescheck = nested_check()
    if not nescheck:
        save_content('_97aPb', base_name)
        return

    count_img = 0
    while nescheck:
        elem_img = chrome.find_element(By.CLASS_NAME, 'rQDP3')
        save_multiple(base_name + '.' + str(count_img), elem_img)
        count_img += 1
        nescheck.click()
        nescheck = nested_check()

    # No arrow is left, so the item now displayed is the last one.
    save_multiple(base_name + '.' + str(count_img), elem_img,
                  last_img_flag=True)


def download_allposts():
    """Download every post of the profile named in the global ``url``.

    Opens the first post, creates a directory named after the last
    component of ``url`` and saves each post into it, moving on with the
    "next post" arrow until that arrow is no longer found.
    """
    # open First Post
    first_post()

    user_name = url.split('/')[-1]

    # exist_ok=True replaces the separate isdir() check and so avoids the
    # window between checking for the directory and creating it.
    os.makedirs(user_name, exist_ok=True)

    _save_open_post(user_name, 1)
    c = 2

    while True:
        next_el = next_post()

        # next_post() returns a falsy value when there is no further post
        if not next_el:
            break

        next_el.click()
        time.sleep(1.3)

        try:
            _save_open_post(user_name, c)
        except selenium.common.exceptions.NoSuchElementException:
            print("finished")
            return

        c += 1

Paso 7: Función para hacer clic en la siguiente publicación.

Python3

# function to get next post
def next_post():
    """Return the "next post" arrow of the open post, or 0 if absent.

    Returns:
        The element with class ``coreSpriteRightPaginationArrow`` found in
        the global ``chrome`` session, or ``0`` when no such element
        exists.
    """
    # find_element_by_class_name was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    try:
        return chrome.find_element(
            By.CLASS_NAME, "coreSpriteRightPaginationArrow")
    except selenium.common.exceptions.NoSuchElementException:
        # 0 is falsy and also compares equal to False, so both
        # `if not result` and `result != False` checks treat it as
        # "no next post".
        return 0

Paso 8: Función para guardar publicaciones normales.

Python3

# Function to save content of the current post
def save_content(class_name, img_name):
    """Download the media of a single-item post and write it to a file.

    The element with class ``class_name`` is looked up in the global
    ``chrome`` session; the ``src`` of its first ``<video>`` or, failing
    that, of its first ``<img>`` is fetched and saved as ``img_name``.
    Lookup and download failures are reported with a printed message and
    no file is written.

    Args:
        class_name: Class of the element that wraps the post media.
        img_name: Path of the file to write.
    """
    # find_element_by_class_name was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    time.sleep(0.5)

    try:
        pic = chrome.find_element(By.CLASS_NAME, class_name)
    except selenium.common.exceptions.NoSuchElementException:
        print("Either This user has no images or you haven't followed this user or something went wrong")
        return

    soup = bs(pic.get_attribute('innerHTML'), 'html.parser')

    # prefer the video; fall back to the image
    media = soup.find('video') or soup.find('img')
    link = media.get('src') if media else None
    if not link:
        print("No image or video source found for " + img_name)
        return

    try:
        # The timeout keeps a stalled connection from blocking forever;
        # raise_for_status() prevents an HTTP error page from being
        # saved as if it were the media file.
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        print("Could not download " + img_name + ": " + str(err))
        return

    with open(img_name, 'wb') as f:
        f.write(response.content)

    time.sleep(0.9)

Paso 9: función para guardar publicaciones anidadas.

Python3

# Function to save one item of a nested post
def save_multiple(img_name, elem, last_img_flag=False):
    """Save one image/video of a nested post to ``img_name``.

    The inner HTML of ``elem`` is parsed and the ``<li>`` entries of its
    first ``<ul>`` are collected. The ``src`` of the chosen entry's
    ``<video>`` or, failing that, ``<img>`` is downloaded. Missing
    markup and download failures are reported with a printed message and
    no file is written.

    Args:
        img_name: Path of the file to write.
        elem: Element whose inner HTML contains the ``<ul>`` of items.
        last_img_flag: When true, the last ``<li>`` is saved; otherwise
            the middle one (index ``len(list) // 2``).
    """
    time.sleep(1)
    html = bs(elem.get_attribute('innerHTML'), 'html.parser')

    # find() yields the first <ul>, or None when there is none
    biglist = html.find('ul')
    list_images = biglist.find_all('li') if biglist else []
    if not list_images:
        print("No images or videos found for " + img_name)
        return

    if last_img_flag:
        user_image = list_images[-1]
    else:
        user_image = list_images[len(list_images) // 2]

    # prefer the video; fall back to the image
    media = user_image.find('video') or user_image.find('img')
    link = media.get('src') if media else None
    if not link:
        print("No image or video source found for " + img_name)
        return

    try:
        # The timeout keeps a stalled connection from blocking forever;
        # raise_for_status() prevents an HTTP error page from being
        # saved as if it were the media file.
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        print("Could not download " + img_name + ": " + str(err))
        return

    with open(img_name, 'wb') as f:
        f.write(response.content)

Paso 10: función para verificar si la publicación está anidada o no.

Python3

# function to check if the post is nested
def nested_check():
    """Return the "next item" arrow of a nested post, or 0 if absent.

    Waits one second, then looks for the element with class
    ``coreSpriteRightChevron`` in the global ``chrome`` session.

    Returns:
        The element when found, otherwise the falsy value ``0``.
    """
    # find_element_by_class_name was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    time.sleep(1)

    try:
        # The class name is passed without the stray trailing spaces:
        # whitespace separates class names and is not part of one.
        return chrome.find_element(By.CLASS_NAME, 'coreSpriteRightChevron')
    except selenium.common.exceptions.NoSuchElementException:
        return 0

Paso 11: Llamar a las funciones requeridas en el código del controlador.

Python3

# Driver Code
path()
try:
    time.sleep(1)
    url_name(url)
    login(username, password)
    download_allposts()
finally:
    # quit() closes every window and ends the driver session, whereas
    # close() only closes the current window. Running it in `finally`
    # releases the browser even when one of the steps above raises.
    chrome.quit()

A continuación se muestra el programa completo basado en el enfoque anterior:

Python3

# import required modules
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import selenium.common.exceptions
import time
from bs4 import BeautifulSoup as bs
import requests
import os
 
 
# get instagram account credentials
# getpass (standard library) reads the password without echoing it to the
# terminal, unlike input(); it is imported here so that this snippet is
# self-contained.
from getpass import getpass

username = input('Enter Your User Name ')
password = getpass('Enter Your Password ')

# assign URL
# Surrounding whitespace and slashes are stripped from the profile name so
# that the last '/'-separated component of the URL is the name itself.
url = 'https://instagram.com/' + \
    input('Enter User Name Of User For Downloading Posts ').strip().strip('/')
 
# Start the browser session
def path():
    """Launch a new Chrome session and store it in the global ``chrome``.

    ``webdriver.Chrome()`` is called without arguments; add the driver
    path to that call if your installation requires it.
    """
    global chrome
    chrome = webdriver.Chrome()
     
# Open the requested page
def url_name(url):
    """Navigate the global ``chrome`` session to ``url``, then pause.

    Args:
        url: Address of the page to open.
    """
    chrome.get(url)

    # Unconditional 4-second pause (time.sleep, not a Selenium wait): it
    # gives the page time to render so that the element lookups that
    # follow do not fail with NoSuchElementException too early.
    time.sleep(4)
     
# Login to access post
def login(username, your_password):
    """Log in to Instagram through the global ``chrome`` session.

    Clicks the log-in button on the page that is currently open, types
    the credentials into the form, submits it and then clicks the
    "Not Now" button.

    Args:
        username: Account name typed into the ``username`` field.
        your_password: Password typed into the ``password`` field.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If any of the
            buttons or form fields cannot be found.
    """
    # The find_element_by_* helpers were removed in Selenium 4.3;
    # find_element(By.<strategy>, value) works on old and new releases.
    from selenium.webdriver.common.by import By

    log_but = chrome.find_element(By.CLASS_NAME, "L3NKy")
    time.sleep(2)
    log_but.click()
    time.sleep(4)

    # fill in the username box
    usern = chrome.find_element(By.NAME, "username")
    usern.send_keys(username)

    # fill in the password box
    passw = chrome.find_element(By.NAME, "password")
    passw.send_keys(your_password)

    # pressing Enter inside the password box submits the form
    passw.send_keys(Keys.RETURN)

    time.sleep(5.5)

    # Find and click the "Not Now" button
    notn = chrome.find_element(By.CLASS_NAME, "yWX7d")
    notn.click()
    time.sleep(3)
     
# Function to open the first post
def first_post():
    """Click the first post on the page open in the global ``chrome``.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If no element
            with class ``kIKUG`` is present.
    """
    # find_element_by_class_name was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    # click() returns None, so its result is not stored
    chrome.find_element(By.CLASS_NAME, "kIKUG").click()
    time.sleep(2)
     
# Function to get next post
def next_post():
    """Return the "next post" arrow of the open post, or 0 if absent.

    Returns:
        The element with class ``coreSpriteRightPaginationArrow`` found in
        the global ``chrome`` session, or ``0`` when no such element
        exists.
    """
    # find_element_by_class_name was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    try:
        return chrome.find_element(
            By.CLASS_NAME, "coreSpriteRightPaginationArrow")
    except selenium.common.exceptions.NoSuchElementException:
        # 0 is falsy and also compares equal to False, so both
        # `if not result` and `result != False` checks treat it as
        # "no next post".
        return 0
       
# Helper: save the media of the post that is currently open
def _save_open_post(user_name, post_number):
    """Download every image/video of the post currently open in ``chrome``.

    A post with one item is written to ``<user_name>/content<post_number>``;
    a nested post is written item by item to
    ``<user_name>/content<post_number>.<i>`` with ``i`` starting at 0.

    Args:
        user_name: Name of the existing directory to write into.
        post_number: 1-based position of the post, used in the file name.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If a nested
            post has no element with class ``rQDP3``.
    """
    # find_element_by_class_name was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    base_name = user_name + '/' + 'content' + str(post_number)

    # nested_check() returns the "next item" arrow, or 0 when there is none
    nescheck = nested_check()
    if not nescheck:
        save_content('_97aPb', base_name)
        return

    count_img = 0
    while nescheck:
        elem_img = chrome.find_element(By.CLASS_NAME, 'rQDP3')
        save_multiple(base_name + '.' + str(count_img), elem_img)
        count_img += 1
        nescheck.click()
        nescheck = nested_check()

    # No arrow is left, so the item now displayed is the last one.
    save_multiple(base_name + '.' + str(count_img), elem_img,
                  last_img_flag=True)


# Download content of all posts
def download_allposts():
    """Download every post of the profile named in the global ``url``.

    Opens the first post, creates a directory named after the last
    component of ``url`` and saves each post into it, moving on with the
    "next post" arrow until that arrow is no longer found.
    """
    # open First Post
    first_post()

    user_name = url.split('/')[-1]

    # exist_ok=True replaces the separate isdir() check and so avoids the
    # window between checking for the directory and creating it.
    os.makedirs(user_name, exist_ok=True)

    _save_open_post(user_name, 1)
    c = 2

    while True:
        next_el = next_post()

        # next_post() returns a falsy value when there is no further post
        if not next_el:
            break

        next_el.click()
        time.sleep(1.3)

        try:
            _save_open_post(user_name, c)
        except selenium.common.exceptions.NoSuchElementException:
            print("finished")
            return

        c += 1
 
# Function to save content of the current post
def save_content(class_name, img_name):
    """Download the media of a single-item post and write it to a file.

    The element with class ``class_name`` is looked up in the global
    ``chrome`` session; the ``src`` of its first ``<video>`` or, failing
    that, of its first ``<img>`` is fetched and saved as ``img_name``.
    Lookup and download failures are reported with a printed message and
    no file is written.

    Args:
        class_name: Class of the element that wraps the post media.
        img_name: Path of the file to write.
    """
    # find_element_by_class_name was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    time.sleep(0.5)

    try:
        pic = chrome.find_element(By.CLASS_NAME, class_name)
    except selenium.common.exceptions.NoSuchElementException:
        print("Either This user has no images or you haven't followed this user or something went wrong")
        return

    soup = bs(pic.get_attribute('innerHTML'), 'html.parser')

    # prefer the video; fall back to the image
    media = soup.find('video') or soup.find('img')
    link = media.get('src') if media else None
    if not link:
        print("No image or video source found for " + img_name)
        return

    try:
        # The timeout keeps a stalled connection from blocking forever;
        # raise_for_status() prevents an HTTP error page from being
        # saved as if it were the media file.
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        print("Could not download " + img_name + ": " + str(err))
        return

    with open(img_name, 'wb') as f:
        f.write(response.content)

    time.sleep(0.9)
     
# Function to save one item of a nested post
def save_multiple(img_name, elem, last_img_flag=False):
    """Save one image/video of a nested post to ``img_name``.

    The inner HTML of ``elem`` is parsed and the ``<li>`` entries of its
    first ``<ul>`` are collected. The ``src`` of the chosen entry's
    ``<video>`` or, failing that, ``<img>`` is downloaded. Missing
    markup and download failures are reported with a printed message and
    no file is written.

    Args:
        img_name: Path of the file to write.
        elem: Element whose inner HTML contains the ``<ul>`` of items.
        last_img_flag: When true, the last ``<li>`` is saved; otherwise
            the middle one (index ``len(list) // 2``).
    """
    time.sleep(1)
    html = bs(elem.get_attribute('innerHTML'), 'html.parser')

    # find() yields the first <ul>, or None when there is none
    biglist = html.find('ul')
    list_images = biglist.find_all('li') if biglist else []
    if not list_images:
        print("No images or videos found for " + img_name)
        return

    if last_img_flag:
        user_image = list_images[-1]
    else:
        user_image = list_images[len(list_images) // 2]

    # prefer the video; fall back to the image
    media = user_image.find('video') or user_image.find('img')
    link = media.get('src') if media else None
    if not link:
        print("No image or video source found for " + img_name)
        return

    try:
        # The timeout keeps a stalled connection from blocking forever;
        # raise_for_status() prevents an HTTP error page from being
        # saved as if it were the media file.
        response = requests.get(link, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as err:
        print("Could not download " + img_name + ": " + str(err))
        return

    with open(img_name, 'wb') as f:
        f.write(response.content)
 
# Function to check if the post is nested
def nested_check():
    """Return the "next item" arrow of a nested post, or 0 if absent.

    Waits one second, then looks for the element with class
    ``coreSpriteRightChevron`` in the global ``chrome`` session.

    Returns:
        The element when found, otherwise the falsy value ``0``.
    """
    # find_element_by_class_name was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    time.sleep(1)

    try:
        # The class name is passed without the stray trailing spaces:
        # whitespace separates class names and is not part of one.
        return chrome.find_element(By.CLASS_NAME, 'coreSpriteRightChevron')
    except selenium.common.exceptions.NoSuchElementException:
        return 0
 
# Driver Code
path()
try:
    time.sleep(1)

    url_name(url)

    login(username, password)

    download_allposts()
finally:
    # quit() closes every window and ends the driver session, whereas
    # close() only closes the current window. Running it in `finally`
    # releases the browser even when one of the steps above raises.
    chrome.quit()

Después de ejecutar este script completo, se creará un directorio que contendrá todas las publicaciones.

Salida: 

Nota: si es usuario de Windows, las publicaciones se guardarán con la extensión .file; ábralas con una aplicación que pueda abrir tanto imágenes como videos (cada publicación de Instagram contiene un solo tipo de medio: imagen o video).

Publicación traducida automáticamente

Artículo escrito por UnworthyProgrammer y traducido por Barcelona Geeks. The original can be accessed here. Licence: CCBY-SA

Deja una respuesta

Tu dirección de correo electrónico no será publicada. Los campos obligatorios están marcados con *