In this project I aim to create a database filled with information about manga: volume sales, adaptations linked to each manga, and more data that can be used for analysis later.

To do so, I will use pandas DataFrames to clean and analyse the data, web scraping to gather it, and SQL to create and fill my database.

We are going to scrape https://myanimelist.net/ because they have a huge catalogue of both manga and their adaptations, and they also include ratings, which can be useful for the analysis part.

We check if we have duplicates
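A minimal sketch of the check, assuming the scraped list sits in a DataFrame with `title` and `url` columns (the sample rows below are only placeholders):

```python
import pandas as pd

# Hypothetical example: a small frame standing in for the scraped manga list.
df = pd.DataFrame({
    "title": ["Berserk", "Monster", "Berserk"],
    "url": [
        "https://myanimelist.net/manga/2/Berserk",
        "https://myanimelist.net/manga/1/Monster",
        "https://myanimelist.net/manga/2/Berserk",
    ],
})

print(df.duplicated(subset="title").sum())            # how many duplicated rows
df = df.drop_duplicates(subset="title").reset_index(drop=True)
```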

At this step we want to go through each row of the dataframe below and get the information we need
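Walking the rows could look roughly like this (it assumes the `df` from the sketch above, with `title` and `url` columns):

```python
for idx, row in df.iterrows():
    # `title` tells us which manga it is, `url` is the page we will scrape next
    print(idx, row["title"], row["url"])
```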

Let's store it.
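For example, the intermediate list could be written to disk like this (the filename is just a placeholder):

```python
df.to_csv("manga_pages.csv", index=False)
```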

We now have the page for each manga. The goal is to go through each page and gather the data we need. We can try it on one page first and then create a function that we will apply to every row.

Let's try with Berserk #RipMiura :(
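A rough attempt on a single page, assuming `requests` and `BeautifulSoup`; the tag lookups are guesses about the page layout and may need adjusting after inspecting the HTML in the browser:

```python
import requests
from bs4 import BeautifulSoup

url = "https://myanimelist.net/manga/2/Berserk"
headers = {"User-Agent": "Mozilla/5.0"}   # a bare request is more likely to be refused

resp = requests.get(url, headers=headers, timeout=10)
soup = BeautifulSoup(resp.text, "html.parser")

title = soup.find("h1").get_text(strip=True)
# The score is assumed to sit in a span with itemprop="ratingValue"; verify in the browser.
score_tag = soup.find("span", itemprop="ratingValue")
score = float(score_tag.get_text()) if score_tag else None

print(title, score)
```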

Let's change some column types to strings.
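For instance (the column names are only examples):

```python
for col in ["title", "url", "genres"]:
    df[col] = df[col].astype(str)
```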

Let's create the main function so we can apply it to every row. This function will gather the information and store it in a dictionary; from there we just retrieve the info from the dict and transform it before storing it in the dataframe.
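A minimal sketch of such a function, assuming requests/BeautifulSoup and only a couple of fields; the selectors are assumptions about the page markup, and the real function would collect many more fields:

```python
import requests
import numpy as np
from bs4 import BeautifulSoup

def scrape_manga_page(url):
    """Fetch one manga page and return the fields we need as a dictionary."""
    resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    info = {}
    info["title"] = soup.find("h1").get_text(strip=True)

    # Some manga have no score yet, so fall back to NaN instead of crashing.
    score_tag = soup.find("span", itemprop="ratingValue")
    info["score"] = float(score_tag.get_text()) if score_tag else np.nan

    return info
```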

Before using the function we create a list to track the webpages where we encounter an error. The errors we encounter are mainly caused by the website thinking we are a bot and refusing the connection. By adding delays we can work around that, but we still have to track the other errors so we can solve them. For example, at some point a manga had no score, so we had to store it as NaN, but we only encountered this case after around 30,000 pages.
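The loop around the function could look like this (the delay, the column names, and the hypothetical `scrape_manga_page` from the sketch above are assumptions):

```python
import time

errors = []   # (row index, url, exception) for pages we will have to revisit

for idx, row in df.iterrows():
    try:
        info = scrape_manga_page(row["url"])
        df.loc[idx, "score"] = info["score"]
    except Exception as exc:
        errors.append((idx, row["url"], exc))
    time.sleep(2)   # small delay so the site is less likely to flag us as a bot
```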

Let's start the web scraping. We converted the name column into str because when we stored np.nan values in the CSV, they were saved as text.

Here we can inspect our error list and rerun the web scraping. If the error was caused by the website blocking our access, it will be solved directly; if not, we can fix our code to handle the case.
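A retry pass over the error list might look like this (same assumptions as the loop above):

```python
still_failing = []

for idx, url, _ in errors:
    try:
        info = scrape_manga_page(url)
        df.loc[idx, "score"] = info["score"]
    except Exception as exc:
        still_failing.append((idx, url, exc))
    time.sleep(2)

errors = still_failing   # whatever is left points at real code issues to fix
```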

Let's store it in a CSV
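For example (the filename is a placeholder):

```python
df.to_csv("manga_scraped.csv", index=False)
```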

We are done with the web scraping part. We will clean this data and transform it according to our needs, but we don't need to gather more information at this step.