rewrite fanza.py

This commit is contained in:
jnozsc 2020-02-17 10:47:11 -08:00
parent 690557f878
commit 5f46f3f25d
2 changed files with 191 additions and 86 deletions

View File

@ -423,7 +423,7 @@ def PrintFiles(option, path, c_word, naming_rule, part, cn_sub, json_data, filep
print(" <num>" + number + "</num>", file=code) print(" <num>" + number + "</num>", file=code)
print(" <premiered>" + release + "</premiered>", file=code) print(" <premiered>" + release + "</premiered>", file=code)
print(" <cover>" + cover + "</cover>", file=code) print(" <cover>" + cover + "</cover>", file=code)
print(" <website>" + "https://www.javbus.com/" + number + "</website>", file=code) print(" <website>" + website + "</website>", file=code)
print("</movie>", file=code) print("</movie>", file=code)
print("[+]Writeed! " + path + "/" + number + c_word + ".nfo") print("[+]Writeed! " + path + "/" + number + c_word + ".nfo")
elif option == 'kodi': elif option == 'kodi':
@ -470,7 +470,7 @@ def PrintFiles(option, path, c_word, naming_rule, part, cn_sub, json_data, filep
print(" <num>" + number + "</num>", file=code) print(" <num>" + number + "</num>", file=code)
print(" <release>" + release + "</release>", file=code) print(" <release>" + release + "</release>", file=code)
print(" <cover>" + cover + "</cover>", file=code) print(" <cover>" + cover + "</cover>", file=code)
print(" <website>" + "https://www.javbus.com/" + number + "</website>", file=code) print(" <website>" + website + "</website>", file=code)
print("</movie>", file=code) print("</movie>", file=code)
print("[+]Writeed! " + path + "/" + number + c_word + ".nfo") print("[+]Writeed! " + path + "/" + number + c_word + ".nfo")
except IOError as e: except IOError as e:

273
fanza.py
View File

@ -1,121 +1,226 @@
#!/usr/bin/python #!/usr/bin/python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import re
from lxml import etree
import json import json
import re
from lxml import etree
from ADC_function import * from ADC_function import *
# import sys # import sys
# import io # import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
def getTitle(text):
    """Return the movie title from a fanza detail page.

    :param text: raw HTML of the detail page
    :raises IndexError: if the #title element is missing
    """
    html = etree.fromstring(text, etree.HTMLParser())
    result = html.xpath('//*[@id="title"]/text()')[0]
    return result
def getActor(text):
    """Return the performer names as a single comma-separated string.

    Returns "" when the page lists no performers.
    """
    # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    html = etree.fromstring(text, etree.HTMLParser())
    # str() of the xpath list gives "['a', 'b']"; strip the brackets/quotes
    # and collapse the separators to plain commas.
    result = (
        str(
            html.xpath(
                "//td[contains(text(),'出演者')]/following-sibling::td/span/a/text()"
            )
        )
        .strip(" ['']")
        .replace("', '", ",")
    )
    return result
def getStudio(text):
    """Return the maker/studio name (メーカー) from a fanza detail page.

    Some pages wrap the value in an <a>, others leave it as bare text,
    so fall back to the plain-text sibling when the link lookup fails.
    """
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'メーカー')]/following-sibling::td/a/text()"
        )[0]
    except IndexError:
        result = html.xpath(
            "//td[contains(text(),'メーカー')]/following-sibling::td/text()"
        )[0]
    return result
def getRuntime(text):
    """Return the runtime in minutes (digits only) from the 収録時間 row."""
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    result = html.xpath("//td[contains(text(),'収録時間')]/following-sibling::td/text()")[0]
    # The cell reads e.g. "120分" — keep only the leading number.
    return re.search(r"\d+", str(result)).group()
def getLabel(text):
    """Return the series (シリーズ) label from a fanza detail page.

    Falls back to the plain-text sibling when the value is not a link.
    """
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'シリーズ:')]/following-sibling::td/a/text()"
        )[0]
    except IndexError:
        result = html.xpath(
            "//td[contains(text(),'シリーズ:')]/following-sibling::td/text()"
        )[0]
    return result
def getNum(text):
    """Return the product number / hinban (品番) from a fanza detail page.

    Falls back to the plain-text sibling when the value is not a link.
    """
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'品番:')]/following-sibling::td/a/text()"
        )[0]
    except IndexError:
        result = html.xpath(
            "//td[contains(text(),'品番:')]/following-sibling::td/text()"
        )[0]
    return result
def getYear(getRelease):
    """Extract the four-digit year from a release-date string.

    Returns the input unchanged when no year can be found (including
    non-string input such as None). The parameter name shadows the
    module-level getRelease() function; kept for interface compatibility.
    """
    try:
        return str(re.search(r"\d{4}", getRelease).group())
    except (AttributeError, TypeError):
        # AttributeError: no match (.group() on None); TypeError: not a string
        return getRelease
def getRelease(text):
    """Return the release date (発売日) from a fanza detail page.

    Falls back to the plain-text sibling when the value is not a link;
    strips the leading newline the page puts before the date.
    """
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'発売日:')]/following-sibling::td/a/text()"
        )[0].lstrip("\n")
    except IndexError:
        result = html.xpath(
            "//td[contains(text(),'発売日:')]/following-sibling::td/text()"
        )[0].lstrip("\n")
    return result
def getTag(text):
    """Return the genre tags (ジャンル) as a list of strings.

    xpath() without indexing never raises IndexError, so the fallback
    branch is effectively dead; it is kept to mirror the other extractors.
    """
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'ジャンル:')]/following-sibling::td/a/text()"
        )
    except Exception:
        result = html.xpath(
            "//td[contains(text(),'ジャンル:')]/following-sibling::td/text()"
        )
    return result
def getCover(text, number):
    """Return the cover-image URL for the given product number.

    :param text: raw HTML of the detail page
    :param number: product number used as the image element id
    :raises ValueError: when no matching image element exists
    """
    html = etree.fromstring(text, etree.HTMLParser())
    cover_number = number
    if "_" in cover_number:
        # fanza encodes "_" as the literal sequence \u005f in the image id
        cover_number = cover_number.replace("_", r"\u005f")
    try:
        result = html.xpath('//*[@id="' + cover_number + '"]/@href')[0]
    except IndexError:
        # (TODO) handle more edge cases
        # raise here — same behavior as before; callers mainly need the picture
        raise ValueError("can not find image")
    return result
def getDirector(text):
    """Return the director (監督) from a fanza detail page.

    Falls back to the plain-text sibling when the value is not a link.
    """
    html = etree.fromstring(text, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        result = html.xpath(
            "//td[contains(text(),'監督:')]/following-sibling::td/a/text()"
        )[0]
    except IndexError:
        result = html.xpath(
            "//td[contains(text(),'監督:')]/following-sibling::td/text()"
        )[0]
    return result
def getOutline(text):
    """Return the plot outline text, or "" when none can be found.

    Tries the direct text of the outline div first; if that is empty
    (outline wrapped in <p>), tries the nested paragraph text.
    """
    html = etree.fromstring(text, etree.HTMLParser())
    try:
        result = str(html.xpath("//div[@class='mg-b20 lh4']/text()")[0]).replace(
            "\n", ""
        )
        if result == "":
            result = str(html.xpath("//div[@class='mg-b20 lh4']//p/text()")[0]).replace(
                "\n", ""
            )
    except IndexError:
        # (TODO) handle more edge cases; missing outline is non-fatal
        return ""
    return result
def main(number):
    """Scrape fanza (dmm.co.jp) for the given product number.

    Tries digital/mono video and anime detail URLs in turn and returns a
    JSON string with the movie metadata, or {"title": ""} on any failure.
    """
    # fanza allows letters + digits + underscore; normalize the input here.
    # @note: underscore usage observed only as e.g. h_test123456789
    fanza_search_number = number
    # AV_Data_Capture.py getNumber() over-formats the input; restore the h_ prefix
    if fanza_search_number.startswith("h-"):
        fanza_search_number = fanza_search_number.replace("h-", "h_")

    fanza_search_number = re.sub(r"[^0-9a-zA-Z_]", "", fanza_search_number).lower()

    fanza_urls = [
        "https://www.dmm.co.jp/digital/videoa/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=",
        "https://www.dmm.co.jp/digital/anime/-/detail/=/cid=",
        "https://www.dmm.co.jp/mono/anime/-/detail/=/cid=",
    ]
    chosen_url = ""
    for url in fanza_urls:
        chosen_url = url + fanza_search_number
        htmlcode = get_html(chosen_url)
        if "404 Not Found" not in htmlcode:
            break

    if "404 Not Found" in htmlcode:
        # no section (digital/mono, video/anime) carries this number
        return json.dumps({"title": ""})

    try:
        # For some old pages the input number does not match the page:
        # the url may be cid=test012 while the hinban on the page is
        # test00012 — fetch the hinban first and feed it to the extractors.
        fanza_hinban = getNum(htmlcode)
        data = {
            "title": getTitle(htmlcode).strip(getActor(htmlcode)),
            "studio": getStudio(htmlcode),
            "outline": getOutline(htmlcode),
            "runtime": getRuntime(htmlcode),
            # anime pages have no director/actor rows
            "director": getDirector(htmlcode) if "anime" not in chosen_url else "",
            "actor": getActor(htmlcode) if "anime" not in chosen_url else "",
            "release": getRelease(htmlcode),
            "number": fanza_hinban,
            "cover": getCover(htmlcode, fanza_hinban),
            "imagecut": 1,
            "tag": getTag(htmlcode),
            "label": getLabel(htmlcode),
            "year": getYear(getRelease(htmlcode)),
            "actor_photo": "",
            "website": chosen_url,
            "source": "fanza.py",
        }
    except Exception:
        # any scrape failure degrades to an empty result instead of crashing
        data = {
            "title": "",
        }
    js = json.dumps(
        data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
    )  # .encode('UTF-8')
    return js
if __name__ == "__main__":
    # Manual smoke tests — uncomment to run against the live site:
    # print(main("DV-1562"))
    # print(main("ipx292"))
    # input("[+][+]Press enter key exit, you can check the error message before you exit.")
    pass