112 lines
3.9 KiB
Python
112 lines
3.9 KiB
Python
import logging
|
|
import sqlite3
|
|
import threading
|
|
import time
|
|
|
|
|
|
class Converter(object):
|
|
def __init__(self, database_path, file_path):
|
|
logging.basicConfig(
|
|
level=logging.DEBUG,
|
|
format="%(asctime)s %(levelname)s %(message)s",
|
|
datefmt='%Y-%m-%d %H:%M:%S',
|
|
)
|
|
self.logger = logging.getLogger()
|
|
self.database_connection = None
|
|
self.database_path = database_path
|
|
self.file_path = file_path
|
|
self.file_rows = 0
|
|
self.handle_total = 0
|
|
self.handle_invalid = 0
|
|
self.handle_queue = 0
|
|
self.cancel_print_insertion_speed = None
|
|
|
|
def connect_database(self):
|
|
self.database_connection = sqlite3.connect(self.database_path)
|
|
|
|
def close_database(self):
|
|
self.database_connection.close()
|
|
|
|
def insert(self, id, name, nickname, password, email, id_number, phone_number):
|
|
cursor = self.database_connection.cursor()
|
|
try:
|
|
cursor.execute(
|
|
"INSERT INTO jd VALUES (?, ?, ?, ?, ?, ?, ?);",
|
|
(id, name, nickname, password, email, id_number, phone_number)
|
|
)
|
|
except sqlite3.IntegrityError:
|
|
self.handle_invalid += 1
|
|
finally:
|
|
self.handle_total += 1
|
|
self.handle_queue += 1
|
|
pass
|
|
|
|
def start_insertion_speed(self):
|
|
event = threading.Event()
|
|
|
|
def print_insertion_speed():
|
|
handle_total = self.handle_total
|
|
while not event.wait(1):
|
|
if self.handle_total - handle_total == 0:
|
|
continue
|
|
self.logger.info("{}/s, {}/{} progress, {} rows are invalid, {} seconds left".format(
|
|
self.handle_total - handle_total,
|
|
self.handle_total,
|
|
self.file_rows,
|
|
self.handle_invalid,
|
|
(self.file_rows - self.handle_total) / (self.handle_total - handle_total),
|
|
))
|
|
handle_total = self.handle_total
|
|
|
|
threading.Thread(target=print_insertion_speed).start()
|
|
return event.set
|
|
|
|
def start(self):
|
|
# Get the number of file rows
|
|
self.logger.info("start scanning file lines")
|
|
start_time = time.time()
|
|
with open(self.file_path) as file:
|
|
self.file_rows = 0
|
|
for _ in file:
|
|
self.file_rows += 1
|
|
end_time = time.time()
|
|
self.logger.info("scan completed, there are a total of {} lines, and it taken {} seconds".format(
|
|
self.file_rows,
|
|
end_time - start_time,
|
|
))
|
|
# Insert QQ and phone numbers
|
|
self.connect_database()
|
|
self.cancel_print_insertion_speed = self.start_insertion_speed()
|
|
with open(self.file_path) as file:
|
|
for line in file:
|
|
try:
|
|
dataset = line.strip().split("---")
|
|
name = dataset[0]
|
|
nickname = dataset[1]
|
|
password = dataset[2]
|
|
email = dataset[3]
|
|
id_number = dataset[4]
|
|
phone_number = dataset[5]
|
|
except IndexError:
|
|
self.handle_invalid += 1
|
|
pass
|
|
finally:
|
|
self.handle_total += 1
|
|
self.insert(self.handle_total, name, nickname, password, email, id_number, phone_number)
|
|
if self.handle_queue >= 400000:
|
|
self.database_connection.commit()
|
|
self.handle_queue = 0
|
|
self.database_connection.commit()
|
|
self.cancel_print_insertion_speed()
|
|
self.close_database()
|
|
self.logger.info("completed, insert {} rows, {} rows of invalid data".format(
|
|
self.handle_total,
|
|
self.handle_invalid,
|
|
))
|
|
exit()
|
|
|
|
|
|
if __name__ == '__main__':
|
|
converter = Converter("database/database.db", "source/www_jd_com_12g.txt")
|
|
converter.start()
|