# User:GemmaBot/lostfilms.py
#
# From Wikisource
import re
import string

# y = open('SFF.html', 'r')

# y = y.read()


# hcount = 0

# f = re.findall('href="(http://memory\.loc\.gov/diglib/ihas/loc\.mbrs\.sfdb\..+?)"', y)

# f.remove('http://memory.loc.gov/diglib/ihas/loc.mbrs.sfdb.8950/')

# x = open('allfilms.txt', 'r')

# v = x.readlines()

# z = x.read()

# # list_of_more_parens = []
# lines = []

# count = 0
# for l in v:
#   if l != '':
#     print("Line is: '" + l + "'")
#     l = l.replace("\n", "")
#     l = "'''" + l + "'''"
#     l = re.sub("'''[0-9]+? (.+?) (\(19.+?\))", "'''["+f[count]+r" \1] \2", l)
#     lines.append(l)
#   # if '(' not in l:
#   #   print("GOT HERE", l)
#   # if l.count('(') > 1:
#   #   print("GOT TO MORE THAN ONE PARENS", l)
#   #   # list_of_more_parens.append(l)
#   count += 1
  
# print(lines)
# print(len(lines))

# ultimate_string = '\n\n'.join(lines)

# w = open('newfilms.txt', 'w+')

# w.write(ultimate_string)

# w.close()

# exit()
  

# print(count)



# for 

# s = list(set(f))

# print(len(s))

# g = re.findall('[0-9]+? (.+? \(19.+?\)', z)
# # g = re.findall('\n[0-9]+? .+? \(19', z)
# # g = re.findall('1', z)


# print(g)
# print("is g")
# print(len(g))

# ultimate_string = 'a'

# w = open('newfilms.txt', 'w+')

# w.write(ultimate_string)

# w.close()


# Number of counted (non-blank) lines allowed before a page break is inserted.
LINES_PER_PAGE = 46
# Lines at least this long are counted twice (presumably because they wrap
# onto a second line on the page -- TODO confirm against the target layout).
LONG_LINE_LENGTH = 137
# Marker that exempts a long line from the double count; removed from output.
IGNORE_MARKER = "<!--bot ignore-->"
# Captures the text between the first space inside '''[ ... ] and the first
# closing bracket, i.e. the link label of an entry such as '''[<url> <label>].
ENTRY_PATTERN = re.compile(r"'''\[.+? (.+?)\]")


def paginate_film_list(lines, lines_per_page=LINES_PER_PAGE):
    """Return the film list text with letter headings and page breaks added.

    Args:
        lines: Iterable of lines, each normally ending in a newline (as
            produced by ``file.readlines()``).
        lines_per_page: Number of counted lines per page.  Blank lines are
            not counted; lines of ``LONG_LINE_LENGTH`` characters or more
            count twice unless they contain ``IGNORE_MARKER``.

    Returns:
        The processed text as a single string, with every ``IGNORE_MARKER``
        removed.

    A ``##X##`` heading is emitted whenever the first character of an
    entry's link label is an ASCII uppercase letter that differs from the
    current one (initially ``A``).  When the page budget is used up, a
    ``{{nop}}`` line, a ``----`` rule and a repeat of the current heading are
    emitted in place of the blank separator line that follows.
    """
    chunks = []
    remaining = lines_per_page
    current_letter = 'A'

    for line in lines:
        entry = ENTRY_PATTERN.search(line)
        if entry:
            initial = entry.group(1)[0]
            if initial in string.ascii_uppercase and initial != current_letter:
                current_letter = initial
                chunks.append(f"##{current_letter}##\n\n")

        # "<= 0" rather than "== 0": a long line decrements the budget twice
        # and can step over zero, which previously disabled every later
        # page break.
        if remaining <= 0:
            chunks.append('\n{{nop}}\n')
            chunks.append('\n----\n')
            chunks.append(f"\n##{current_letter}##\n\n")
            remaining = lines_per_page
            # The break replaces a blank separator line; any other line is
            # real content and must still be emitted below.
            if line == "\n":
                continue

        if line != "\n":
            remaining -= 1
        chunks.append(line)
        if len(line) >= LONG_LINE_LENGTH and IGNORE_MARKER not in line:
            remaining -= 1

    return ''.join(chunks).replace(IGNORE_MARKER, "")


def main():
    """Read ``newfilms.txt``, paginate it, and write ``filmparse.txt``."""
    with open('newfilms.txt', 'r') as source:
        lines = source.readlines()

    with open('filmparse.txt', 'w') as target:
        target.write(paginate_film_list(lines))


if __name__ == "__main__":
    main()

# I, the copyright holder of this work, hereby release it into the public domain. This applies worldwide.
#
# In case this is not legally possible:
#
# I grant anyone the right to use this work for any purpose, without any conditions, unless such conditions are required by law.
#
# Public domain